ORFEOME_Seq Pipeline¶

InĀ [1]:
# Improved version of Bowtie2 running from Kim's lab
# 1. Import all the libsraries
# 2. setting directories
# 3. Parse FASTA file 
# 4. running bowtie2 in parallel
# 5. Merge read counts to one csv file
# 6. Making summary of log files
# 7. Getting read count from SAM files
# 8. Merging all read count into one xlsx file 
InĀ [2]:
# Setting environment
# 1. Download Anaconda
# 2. Download Jupyter, Python, Pylance etc in Virtual Studio Code extension
# 3. Create a new conda environement in the terminal (conda create -n pipeline)
# 4. Download openpyxl and r-base
# 5. Go to setting and link r-base to least R 4.2.0
# 6. Download all the necessary python packages
# 7. Download all the necessary R packages 

Table of Contents¶

  1. Code initialization
    • Import all the libraries
    • Setting directories
    • Index reference genome and iterate sample names
    • Chemotherapeutic categories and colors
      • Assign color map to excel file
    • Parse FASTA files



  1. Pre-processing
    • FASTQ – Pre QC
    • Pre-QC MultiQC analysis
    • Adapter Trimming (Cutadapt)
    • Quality score trimming (Trimmomatic)
    • FASTQ – Post QC



  1. Alignment
    • Parallel alignment (Bowtie2)
    • Post-QC MultiQC analysis
    • Merging alignment logs
    • Alignment summary



  1. Post-Alignment
    • SAM files to BAM files (samtools)
    • Processing BAM files (samtools)
    • Collect read count from processed bam files
    • Merging samples read counts
    • SAM & BAM Files compression (gzip, tar)
    • SAM & BAM Files decompression (optional)



  1. Normalization
    • Installing R packages (optional)
    • Import R packages
    • Gene count summary
    • Read count normalization (EdgeR, Bioinfokit)
    • Noise detection
    • Noise removal from dataframe
    • Correlation matrix clustering (Pre-normalization)
      • Removing unnecessary columns
    • Box & Violin Plot (Pre-normalization)
    • Quantile Normalization
    • Box & Violin Plot (Post-Quantile Normalization)
    • Batch correction (Limma)
    • Correlation matrix clustering (Post-normalization)



  1. Dimensional reduction analysis
    • Interactive 3D PCA
    • Interactive 2D PCA
    • Interactive 2D T-SEN
    • Interactive 2D UMAP
    • 2D UMAP (Two legends)
    • 2D UMAP (Single legend)
    • Scatter Plot



  1. Statistical Analysis
    • Mean Log2FC calculation
    • Individual Log2FC calculation
    • Statistical Function
    • Differentially Expressed Genes (DEG) calculation



Code initialization ¶

Import all the libraries ¶

InĀ [3]:
#Python default packages
import os
import pwd
import glob
import subprocess
import csv
import random
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
import multiprocessing
from multiprocessing import Pool, cpu_count
import threading
import signal
import sys
import tempfile
import warnings
from itertools import product
from itertools import combinations
import warnings
import gzip
import shutil
import tarfile
import time
from collections import OrderedDict
InĀ [4]:
from numba.core.errors import NumbaDeprecationWarning
# Suppress specific Numba deprecation warnings
warnings.filterwarnings('ignore', category=NumbaDeprecationWarning)
InĀ [5]:
#Python additional packages
import multiqc
import pysam
from tqdm import tqdm
from bioinfokit.analys import norm
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import gridspec
from matplotlib.patches import Patch
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator, LogFormatter
from matplotlib_venn import venn3, venn2
from venn import venn
import pandas as pd
import numpy as np
from Bio import SeqIO
from kmodes.kmodes import KModes
from scipy.stats import rankdata
from scipy.stats import chi2, t, norm
from scipy.stats import pearsonr, spearmanr, kendalltau, multiscale_graphcorr
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from scipy.stats import gamma, norm, poisson
from scipy.optimize import brentq
from scipy.optimize import minimize
from scipy.spatial import distance_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import coo_matrix
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import webcolors
from webcolors import hex_to_rgb
import umap
import hdbscan
%matplotlib inline
# %matplotlib agg
# %matplotlib agg
InĀ [6]:
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Suppress DeprecationWarnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
InĀ [7]:
plt.rcParams['axes.labelsize'] = 35  # For X and Y labels
plt.rcParams['xtick.labelsize'] = 30# For X-axis tick labels
plt.rcParams['ytick.labelsize'] = 30 # For Y-axis tick labels
plt.rcParams['legend.fontsize'] = 30 # For the legend
plt.rcParams['axes.titlesize'] = 30 # For the plot title
plt.rcParams['axes.linewidth'] = 1.5 # For the axes lines
plt.rcParams['font.size'] = 20 # For all text in the plot
plt.rcParams['axes.labelpad'] = 20
plt.rcParams['svg.fonttype'] = 'none'
InĀ [984]:
plt.rcParams['svg.fonttype'] = 'none'
InĀ [8]:
# # DAVID related packages
# import logging
# import traceback as tb
# import suds.metrics as metrics
# from tests import *
# from suds import *
# from suds.client import Client
# from datetime import datetime
InĀ [9]:
#Python package for R
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector, FloatVector
from rpy2.robjects import pandas2ri
from rpy2.robjects import FactorVector
from rpy2.robjects.vectors import DataFrame
pandas2ri.activate()
import rpy2.robjects.numpy2ri as numpy2ri
from rpy2.robjects.conversion import localconverter

Setting directories ¶

InĀ [10]:
# Define the base directory variable
directory = "RQ023682" 

# Define User ID
uid = os.getuid()
user_info = pwd.getpwuid(uid)
user_id = user_info.pw_name

# Define the base directory
base_dir = f"~/rnaseq_analysis/{directory}"

# Expand the tilde (~) in the base directory path
base_dir = os.path.expanduser(base_dir)

# Get the list of RS-* folders
folders = glob.glob(os.path.join(base_dir, "RS-*"))

# Create the output .Cut directory
output_cut_dir = os.path.join(base_dir, f"{directory}_cut")
os.makedirs(output_cut_dir, exist_ok=True)

# Create the output .Trim directory
output_trim_dir = os.path.join(base_dir, f"{directory}_trim")
os.makedirs(output_trim_dir, exist_ok=True)

# Create the output .sam directory
output_sam_dir = os.path.join(base_dir, f"{directory}_sam")
os.makedirs(output_sam_dir, exist_ok=True)

# Create the output .bam directory
output_bam_dir = os.path.join(base_dir, f"{directory}_bam")
os.makedirs(output_bam_dir, exist_ok=True)

# Create the graph storing directory
graphs_files = os.path.join(base_dir, directory+ "_graphs")
os.makedirs(graphs_files, exist_ok=True)

# Create the graph storing for the original
graphs_files_original = os.path.join(graphs_files, "Original_Graph" )
os.makedirs(graphs_files_original, exist_ok=True)

# Create the graph storing for the statistical testing
graphs_files_stats = os.path.join(graphs_files, "Statistical_Graph" )
os.makedirs(graphs_files_stats, exist_ok=True)

# Create the database storing directory
database_files = os.path.join(base_dir, directory+ "_db")
os.makedirs(database_files, exist_ok=True)

# Create the database storing directory
database_files_original = os.path.join(database_files, "Original_db")
os.makedirs(database_files_original, exist_ok=True)

# Create the database storing directory
database_files_stats = os.path.join(database_files, "Statistical_db")
os.makedirs(database_files_stats, exist_ok=True)

# Setting working directory
PATH = base_dir
os.chdir(PATH)

Index reference genome and iterate sample names ¶

InĀ [11]:
# Extract the reference directory from the index path
reference_dir = f"/home/{user_id}/rnaseq_analysis/Reference/9.1_delta/"
os.chdir(reference_dir)
ref_name = "9'1_delta_final_LP_BxB.fa"
ref_dir = os.path.join(reference_dir, ref_name)

# Split the file name by '.' and take the first part as the index base name
index_base_name = ref_name.split('.')[0]

# Check if the Bowtie2 index files already exist
index_files_exist = all(os.path.exists(f"{index_base_name}.{ext}") for ext in ["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"])

if not index_files_exist:
    # Command to run bowtie2-build with the reference_dir as input and index_base_name as output
    cmd = ["bowtie2-build", ref_dir, index_base_name]

    try:
        # Run bowtie2-build using subprocess
        subprocess.run(cmd, check=True)
        print("bowtie2-build completed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while running bowtie2-build: {e}")
else:
    print("Bowtie2 index files already exist. Skipping the indexing process.")

reference_index_dir = os.path.join(reference_dir, index_base_name)

# Create an empty dictionary
sample_key = {}

# Iterate sample names 
for folder in folders:
    r1_names = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))
    r2_names = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))
    if r1_names and r2_names and len(r1_names) == len(r2_names):
        for r1_name, r2_name in zip(r1_names, r2_names):
            r1 = os.path.basename(r1_name).split("_")[1]
            r2 = os.path.basename(r2_name).split("_")[1]
            if r1 == r2:
                folder_name = os.path.basename(folder)
                if folder_name not in sample_key:
                    sample_key[folder_name] = []
                sample_key[folder_name].append(r1)

sample_key = dict(sorted(sample_key.items()))
Bowtie2 index files already exist. Skipping the indexing process.

Chemotherapeutic categories and colors ¶

InĀ [12]:
name_list = ['DMSO',
             'Paclitaxel', 'Cisplatin', 'TFT', 'FdU', 'EdU',
             'Doxorubicin', '5FU', 'Carboplatin', 'Bleomycin', 'Etoposide',
             'MitomycinC', 'Carmustine', 'Irinotecan', '6mercaptopurine',
             'Vinblastine', 'TAS102']
InĀ [13]:
# Final category
drug_category = { 
    "Control": ["DMSO", "Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"],
    "Antimetabolite": ["TFT", "TAS102", "FdU", "EdU", "5FU", "6mercaptopurine"],
    "DNA cross linking agent": ["Cisplatin", "Carboplatin", "Carmustine", "MitomycinC"],
    "DNA strand break agent": ["Doxorubicin", "Etoposide", "Irinotecan", "Bleomycin"],
    "Microtubule inhibitor": ["Paclitaxel","Vinblastine"],
}

# Final category color code
category_colors = { 
    "Control": "#E3E3DD", # Pantone 9101 C Color
    "Antimetabolite": "#B03A2E",  # Close to Medium Carmine
    "DNA cross linking agent": "#28B463",  # Medium Sea Green
    "DNA strand break agent": "#7C3B97",  # Cadmium Violet
    "Microtubule inhibitor": "#2F86C1",  # Boyzone
}

# Final chemotherapy color code
drug_color_map = { 
    # Control
    "DMSO": "#EAEAE6",  # White Sail (DMSO)
    "Baseline": "#F1F1EF", # Bleached Silk (Baseline)
    "mCherryPositive&BFPNegative" : "#F5F5F4", # Ivory (mCherry+/BFP-)
    "mCherryNegative&BFPNegative" : "#F5F5F4", # Ivory (mCherry-/BFP-)
    "Serumfree" : "#F9F9F8", # Light Ivory (Serum Free)

    # Antimetabolite
    "TFT": "#EC7063",  # Terra Cotta (5-FU)
    "TAS102": "#FADBD8",  # Light grayish red (TAS102)
    "FdU": "#FDEDEC",  # Provincial Pink (FdU)
    "EdU": "#F5B7B1",  # Beauty Bush (TFT)
    "5FU": "#F5AFA8", # Sundown (5FU)
    "6mercaptopurine": "#F1948A",  # Sweet Pink (6-mercaptopurine)

    # Cross Linking Drug
    "Cisplatin": "#82E0AA",  # Pearl Aqua (Carboplatin)
    "Carboplatin": "#ABEBC6",  # Magic Mint (Cisplatin)
    "Carmustine" : "#D5F5E3",  # Aero Blue (Mitomycin C) 
    "MitomycinC": "#EAFAF1",  # Pale green (Carmustine)

    # DNA strand-break agent
    "Doxorubicin": "#BB8FCE",  # Amethyst Show (Bleomycin)
    "Bleomycin" : "#EBDEF0",  # White Lilac (Doxorubicin)
    "Etoposide": "#F5EEF8",  # AliceBlue color (Etoposide)
    "Irinotecan": "#D7BDE2",  # Pretty Petunia (Irinotecan) 

    # Microtubule inhibitor
    "Paclitaxel": "#93C8EC",  # Fail Whale (Paclitaxel)
    "Vinblastine": "#D6EAF8",  # cyan-blue (Vinblastine)
}

Assign color map to excel file ¶

InĀ [14]:
# Helper function to get color name from RGB
def get_color_name_from_hex(hex_code):
    try:
        return webcolors.hex_to_name(hex_code, spec='css3')
    except ValueError:
        return "Unknown"

# Helper function to convert RGB to CMYK
def rgb_to_cmyk(rgb):
    r, g, b = rgb
    c = 1 - r / 255
    m = 1 - g / 255
    y = 1 - b / 255
    k = min(c, m, y)
    if k == 1:
        c = m = y = 0
    else:
        c = round((c - k) / (1 - k) * 100)
        m = round((m - k) / (1 - k) * 100)
        y = round((y - k) / (1 - k) * 100)
        k = round(k * 100)
    return c, m, y, k

# Create the DataFrame
data = []
for drug, hex_code in drug_color_map.items():
    rgb = hex_to_rgb(hex_code)
    color_name = get_color_name_from_hex(hex_code)
    cmyk = rgb_to_cmyk(rgb)

    category = None
    category_color = None
    for key, value in drug_category.items():
        if drug in (value if isinstance(value, list) else [value]):
            category = key
            category_color = category_colors[key]
            break

    data.append([drug, category, category_color, color_name, rgb, hex_code, cmyk])

columns = ["Drug", "Category", "Category_Color", "Color_Name", "RGB", "Drug_Hex_Code", "CMYK"]
color_df = pd.DataFrame(data, columns=columns)

# Save the DataFrame
color_df_path = os.path.join(base_dir, f"{directory}_color.xlsx")
color_df.to_excel(color_df_path)

Parse FASTA files ¶

InĀ [15]:
fasta_rows = []

for record in SeqIO.parse(ref_dir, "fasta"):
    # Get the gene name from the record description
    gene_name = record.description.split()[0]
    # Split the gene name by "_"
    gene_parts = gene_name.split("_")
    # Extract the additional columns
    ORFeome_ID = gene_parts[0]
    NCBI_no = gene_parts[1]
    group_no = gene_parts[2]
    gene_ID = "_".join(gene_parts[3:])
    # Count the number of bases in the sequence
    base_count = len(record.seq)
    # Calculate GC content
    gc_content = (record.seq.count("G") + record.seq.count("C")) / base_count * 100
    # Create a row for the CSV file
    fasta_row = [gene_name, ORFeome_ID, NCBI_no, group_no, gene_ID, base_count, gc_content]
    # Add the row to the list of CSV rows
    fasta_rows.append(fasta_row)

# Path to the output CSV file
fasta_file = os.path.join(base_dir, "fasta_index.csv")

# Write the gene names, base counts, and GC content to the CSV file
with open(fasta_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ID", "ORF_ID", "NCBI", "Group", "Gene_Symbol", "Length", "GC_Content"])
    writer.writerows(fasta_rows)

print(f"CSV file saved to {fasta_file}")
CSV file saved to /home/harryjo/rnaseq_analysis/RQ023682/fasta_index.csv

Pre-processing ¶

FASTQ – Pre QC ¶

InĀ [16]:
# Create a list to store the FastQC subprocess instances
prefastqc_processes = []

# Specify the folder for FastQC output
folder_fastqc_dir = os.path.join(base_dir, f"{directory}_Pre_FASTQC")

# Create the FastQC output directory if it doesn't exist
os.makedirs(folder_fastqc_dir, exist_ok=True)

for folder in folders:
    r1_files = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))[0]
    r2_files = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))[0]

    if not r1_files or not r2_files:
        print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
        continue

    folder_name = os.path.basename(folder)

    # Create a subfolder for the current sample's FastQC output
    sample_fastqc_dir = os.path.join(folder_fastqc_dir, folder_name)
    os.makedirs(sample_fastqc_dir, exist_ok=True)

    r1_files_name = os.path.basename(r1_files).split('.')[0]
    r2_files_name = os.path.basename(r2_files).split('.')[0]

    r1_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}_fastqc.html")
    r2_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}_fastqc.html")

    # Check if FastQC output already exists for R1 file
    if os.path.exists(r1_exist):
        print(f"Skipping {folder_name} - FastQC output already exists for R1")
    else:
        # Run FastQC for R1 file
        command_r1 = [
            "fastqc",
            "-f", "fastq",
            "--extract",
            "-t", "9",
            "-o", sample_fastqc_dir,
            r1_files
        ]
        process_r1 = subprocess.Popen(command_r1, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        prefastqc_processes.append(process_r1)

    # Check if FastQC output already exists for R2 file
    if os.path.exists(r2_exist):
        print(f"Skipping {folder_name} - FastQC output already exists for R2")
    else:
        # Run FastQC for R2 file
        command_r2 = [
            "fastqc",
            "-f", "fastq",
            "--extract",
            "-t", "9",
            "-o", sample_fastqc_dir,
            r2_files
        ]
        process_r2 = subprocess.Popen(command_r2, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        prefastqc_processes.append(process_r2)

# Wait for all subprocesses to finish and re-run ones with errors
for process in prefastqc_processes:
    _, stderr_output = process.communicate()
    if process.returncode != 0:
        print(f"Error message: {stderr_output.decode('utf-8')}")
        
        # Re-run the same FastQC command
        re_run_process = subprocess.Popen(
            process.args,  # Re-run the same command
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        _, re_run_stderr = re_run_process.communicate()
        if re_run_process.returncode != 0:
            print("Failed to re-run FastQC.")

print("FastQC for all R1 and R2 files completed.")
Skipping RS-03984980 - FastQC output already exists for R1
Skipping RS-03984980 - FastQC output already exists for R2
Skipping RS-03985010 - FastQC output already exists for R1
Skipping RS-03985010 - FastQC output already exists for R2
Skipping RS-03985029 - FastQC output already exists for R1
Skipping RS-03985029 - FastQC output already exists for R2
Skipping RS-03985030 - FastQC output already exists for R1
Skipping RS-03985030 - FastQC output already exists for R2
Skipping RS-03985031 - FastQC output already exists for R1
Skipping RS-03985031 - FastQC output already exists for R2
Skipping RS-03985032 - FastQC output already exists for R1
Skipping RS-03985032 - FastQC output already exists for R2
Skipping RS-03985033 - FastQC output already exists for R1
Skipping RS-03985033 - FastQC output already exists for R2
Skipping RS-03985034 - FastQC output already exists for R1
Skipping RS-03985034 - FastQC output already exists for R2
Skipping RS-03985035 - FastQC output already exists for R1
Skipping RS-03985035 - FastQC output already exists for R2
Skipping RS-03985036 - FastQC output already exists for R1
Skipping RS-03985036 - FastQC output already exists for R2
Skipping RS-03985037 - FastQC output already exists for R1
Skipping RS-03985037 - FastQC output already exists for R2
Skipping RS-03985038 - FastQC output already exists for R1
Skipping RS-03985038 - FastQC output already exists for R2
Skipping RS-03985039 - FastQC output already exists for R1
Skipping RS-03985039 - FastQC output already exists for R2
Skipping RS-03985040 - FastQC output already exists for R1
Skipping RS-03985040 - FastQC output already exists for R2
Skipping RS-03985041 - FastQC output already exists for R1
Skipping RS-03985041 - FastQC output already exists for R2
Skipping RS-04068630 - FastQC output already exists for R1
Skipping RS-04068630 - FastQC output already exists for R2
Skipping RS-04068631 - FastQC output already exists for R1
Skipping RS-04068631 - FastQC output already exists for R2
Skipping RS-04068632 - FastQC output already exists for R1
Skipping RS-04068632 - FastQC output already exists for R2
Skipping RS-04068633 - FastQC output already exists for R1
Skipping RS-04068633 - FastQC output already exists for R2
Skipping RS-04068634 - FastQC output already exists for R1
Skipping RS-04068634 - FastQC output already exists for R2
Skipping RS-04068635 - FastQC output already exists for R1
Skipping RS-04068635 - FastQC output already exists for R2
Skipping RS-04068636 - FastQC output already exists for R1
Skipping RS-04068636 - FastQC output already exists for R2
Skipping RS-03984981 - FastQC output already exists for R1
Skipping RS-03984981 - FastQC output already exists for R2
Skipping RS-03984982 - FastQC output already exists for R1
Skipping RS-03984982 - FastQC output already exists for R2
Skipping RS-03984983 - FastQC output already exists for R1
Skipping RS-03984983 - FastQC output already exists for R2
Skipping RS-03984984 - FastQC output already exists for R1
Skipping RS-03984984 - FastQC output already exists for R2
Skipping RS-03984985 - FastQC output already exists for R1
Skipping RS-03984985 - FastQC output already exists for R2
Skipping RS-03984986 - FastQC output already exists for R1
Skipping RS-03984986 - FastQC output already exists for R2
Skipping RS-03984987 - FastQC output already exists for R1
Skipping RS-03984987 - FastQC output already exists for R2
Skipping RS-03984988 - FastQC output already exists for R1
Skipping RS-03984988 - FastQC output already exists for R2
Skipping RS-03984989 - FastQC output already exists for R1
Skipping RS-03984989 - FastQC output already exists for R2
Skipping RS-03984990 - FastQC output already exists for R1
Skipping RS-03984990 - FastQC output already exists for R2
Skipping RS-03984991 - FastQC output already exists for R1
Skipping RS-03984991 - FastQC output already exists for R2
Skipping RS-03984992 - FastQC output already exists for R1
Skipping RS-03984992 - FastQC output already exists for R2
Skipping RS-03984993 - FastQC output already exists for R1
Skipping RS-03984993 - FastQC output already exists for R2
Skipping RS-03984995 - FastQC output already exists for R1
Skipping RS-03984995 - FastQC output already exists for R2
Skipping RS-03984996 - FastQC output already exists for R1
Skipping RS-03984996 - FastQC output already exists for R2
Skipping RS-03984997 - FastQC output already exists for R1
Skipping RS-03984997 - FastQC output already exists for R2
Skipping RS-03984998 - FastQC output already exists for R1
Skipping RS-03984998 - FastQC output already exists for R2
Skipping RS-03984999 - FastQC output already exists for R1
Skipping RS-03984999 - FastQC output already exists for R2
Skipping RS-03985000 - FastQC output already exists for R1
Skipping RS-03985000 - FastQC output already exists for R2
Skipping RS-03985001 - FastQC output already exists for R1
Skipping RS-03985001 - FastQC output already exists for R2
Skipping RS-03985002 - FastQC output already exists for R1
Skipping RS-03985002 - FastQC output already exists for R2
Skipping RS-03985003 - FastQC output already exists for R1
Skipping RS-03985003 - FastQC output already exists for R2
Skipping RS-03985004 - FastQC output already exists for R1
Skipping RS-03985004 - FastQC output already exists for R2
Skipping RS-03985005 - FastQC output already exists for R1
Skipping RS-03985005 - FastQC output already exists for R2
Skipping RS-03985006 - FastQC output already exists for R1
Skipping RS-03985006 - FastQC output already exists for R2
Skipping RS-03985007 - FastQC output already exists for R1
Skipping RS-03985007 - FastQC output already exists for R2
Skipping RS-03985008 - FastQC output already exists for R1
Skipping RS-03985008 - FastQC output already exists for R2
Skipping RS-03985009 - FastQC output already exists for R1
Skipping RS-03985009 - FastQC output already exists for R2
Skipping RS-03984975 - FastQC output already exists for R1
Skipping RS-03984975 - FastQC output already exists for R2
Skipping RS-03984976 - FastQC output already exists for R1
Skipping RS-03984976 - FastQC output already exists for R2
Skipping RS-03984977 - FastQC output already exists for R1
Skipping RS-03984977 - FastQC output already exists for R2
Skipping RS-03984978 - FastQC output already exists for R1
Skipping RS-03984978 - FastQC output already exists for R2
Skipping RS-03984979 - FastQC output already exists for R1
Skipping RS-03984979 - FastQC output already exists for R2
Skipping RS-03985011 - FastQC output already exists for R1
Skipping RS-03985011 - FastQC output already exists for R2
Skipping RS-03985012 - FastQC output already exists for R1
Skipping RS-03985012 - FastQC output already exists for R2
Skipping RS-03985013 - FastQC output already exists for R1
Skipping RS-03985013 - FastQC output already exists for R2
Skipping RS-03985014 - FastQC output already exists for R1
Skipping RS-03985014 - FastQC output already exists for R2
Skipping RS-03985015 - FastQC output already exists for R1
Skipping RS-03985015 - FastQC output already exists for R2
Skipping RS-03985016 - FastQC output already exists for R1
Skipping RS-03985016 - FastQC output already exists for R2
Skipping RS-03985017 - FastQC output already exists for R1
Skipping RS-03985017 - FastQC output already exists for R2
Skipping RS-03985018 - FastQC output already exists for R1
Skipping RS-03985018 - FastQC output already exists for R2
Skipping RS-03985019 - FastQC output already exists for R1
Skipping RS-03985019 - FastQC output already exists for R2
Skipping RS-03985020 - FastQC output already exists for R1
Skipping RS-03985020 - FastQC output already exists for R2
Skipping RS-03985021 - FastQC output already exists for R1
Skipping RS-03985021 - FastQC output already exists for R2
Skipping RS-03985022 - FastQC output already exists for R1
Skipping RS-03985022 - FastQC output already exists for R2
Skipping RS-03985023 - FastQC output already exists for R1
Skipping RS-03985023 - FastQC output already exists for R2
Skipping RS-03985024 - FastQC output already exists for R1
Skipping RS-03985024 - FastQC output already exists for R2
Skipping RS-03985025 - FastQC output already exists for R1
Skipping RS-03985025 - FastQC output already exists for R2
Skipping RS-03985026 - FastQC output already exists for R1
Skipping RS-03985026 - FastQC output already exists for R2
Skipping RS-03985027 - FastQC output already exists for R1
Skipping RS-03985027 - FastQC output already exists for R2
Skipping RS-03985028 - FastQC output already exists for R1
Skipping RS-03985028 - FastQC output already exists for R2
FastQC for all R1 and R2 files completed.

Pre-QC MultiQC analysis ¶

InĀ [17]:
# Specify the folder and file name for the MultiQC report
multiqc_output_dir = os.path.join(base_dir, f"{directory}_Pre-QC_MultiQC")
os.makedirs(multiqc_output_dir, exist_ok=True)
multiqc_report_file = os.path.join(multiqc_output_dir, f"{directory}_Pre-QC_multiqc_report.html")

# Generate the MultiQC report with --interactive flag
if os.path.exists(multiqc_report_file):
    print("Skipping MultiQC - Report file already exists.")
else:
    # Generate the MultiQC report with --interactive flag
    multiqc_command = [
        "multiqc",
        "--interactive",
        folder_fastqc_dir,
        "-o", multiqc_output_dir,
        "--filename", f"{directory}_Pre-QC_multiqc_report.html"
    ]
    subprocess.run(multiqc_command, check=True)

print("MultiQC report generation completed.")

# # Display the MultiQC report in the notebook
# display(HTML(filename=os.path.join(PATH, "multiqc_report.html")))
Skipping MultiQC - Report file already exists.
MultiQC report generation completed.

Adapter Trimming (Cutadapt) ¶

InĀ [18]:
# Define adapter sequences for R1 and R2
Truseq_adapter_sequence_R1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
Truseq_adapter_sequence_R2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
# PCR1_primers_R1 = "CACCCACACAAAGGAAAAGGG"
# PCR1_primers_R2 = "CACATTGCCAAAAGACGGCA"
# PCR2_primers_R1 = "NNNNNCTCACAAGTTTGTACAAAAAAG"
# PCR2_primers_R2 = "NNNNNTGACCACTTTGTACAAGAAAG"

# List to store Cutadapt subprocess instances
cutadapt_processes = []

# Iterate over the RS-* folders
for folder in folders:
    r1_files = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))[0]
    r2_files = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))[0]

    if not r1_files or not r2_files:
        print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
        continue

    folder_name = os.path.basename(folder)

    # Create a subfolder for the current sample's FastQC output
    sample_cut_dir = os.path.join(output_cut_dir, folder_name)
    os.makedirs(sample_cut_dir, exist_ok=True)

    # File names without extensions
    r1_files_name = os.path.basename(r1_files).split('.')[0]
    r2_files_name = os.path.basename(r2_files).split('.')[0]
    
    # Output file paths in the output_cut_dir
    r1_output_file = os.path.join(sample_cut_dir, f"{r1_files_name}.trimmed.fastq.gz")
    r2_output_file = os.path.join(sample_cut_dir, f"{r2_files_name}.trimmed.fastq.gz")

    if os.path.exists(r1_output_file) and os.path.exists(r2_output_file):
        print(f"Skipping {folder_name} - Cutadapt output already exists for R1 and R2")
    else:
        # Create a log directory inside the output_cut_dir
        log_directory = os.path.join(output_cut_dir, "cutadapt_logs")
        os.makedirs(log_directory, exist_ok=True)
        
        # Define log file path for stdout
        stdout_log_path = os.path.join(log_directory, f"{folder_name}_cutadapt_stdout.log")

        # Run Cutadapt for R1 and R2 files
        command = [
            "cutadapt",
            "-a", Truseq_adapter_sequence_R1,
            "-A", Truseq_adapter_sequence_R2,
            "-o", r1_output_file,
            "-p", r2_output_file,
            r1_files,
            r2_files
        ]
        
        # Open log file for stdout
        stdout_log_file = open(stdout_log_path, "w")
        
        # Run the process and store it in the list
        process = subprocess.Popen(command, stdout=stdout_log_file, stderr=subprocess.PIPE)
        cutadapt_processes.append(process)

# Wait for all Cutadapt subprocesses to finish
for process in cutadapt_processes:
    _, stderr_output = process.communicate()
    if process.returncode != 0:
        print(f"Error message: {stderr_output.decode('utf-8')}")

print("Cutadapt for all R1 and R2 files completed.")
Skipping RS-03984980 - Cutadapt output already exists for R1 and R2
Skipping RS-03985010 - Cutadapt output already exists for R1 and R2
Skipping RS-03985029 - Cutadapt output already exists for R1 and R2
Skipping RS-03985030 - Cutadapt output already exists for R1 and R2
Skipping RS-03985031 - Cutadapt output already exists for R1 and R2
Skipping RS-03985032 - Cutadapt output already exists for R1 and R2
Skipping RS-03985033 - Cutadapt output already exists for R1 and R2
Skipping RS-03985034 - Cutadapt output already exists for R1 and R2
Skipping RS-03985035 - Cutadapt output already exists for R1 and R2
Skipping RS-03985036 - Cutadapt output already exists for R1 and R2
Skipping RS-03985037 - Cutadapt output already exists for R1 and R2
Skipping RS-03985038 - Cutadapt output already exists for R1 and R2
Skipping RS-03985039 - Cutadapt output already exists for R1 and R2
Skipping RS-03985040 - Cutadapt output already exists for R1 and R2
Skipping RS-03985041 - Cutadapt output already exists for R1 and R2
Skipping RS-04068630 - Cutadapt output already exists for R1 and R2
Skipping RS-04068631 - Cutadapt output already exists for R1 and R2
Skipping RS-04068632 - Cutadapt output already exists for R1 and R2
Skipping RS-04068633 - Cutadapt output already exists for R1 and R2
Skipping RS-04068634 - Cutadapt output already exists for R1 and R2
Skipping RS-04068635 - Cutadapt output already exists for R1 and R2
Skipping RS-04068636 - Cutadapt output already exists for R1 and R2
Skipping RS-03984981 - Cutadapt output already exists for R1 and R2
Skipping RS-03984982 - Cutadapt output already exists for R1 and R2
Skipping RS-03984983 - Cutadapt output already exists for R1 and R2
Skipping RS-03984984 - Cutadapt output already exists for R1 and R2
Skipping RS-03984985 - Cutadapt output already exists for R1 and R2
Skipping RS-03984986 - Cutadapt output already exists for R1 and R2
Skipping RS-03984987 - Cutadapt output already exists for R1 and R2
Skipping RS-03984988 - Cutadapt output already exists for R1 and R2
Skipping RS-03984989 - Cutadapt output already exists for R1 and R2
Skipping RS-03984990 - Cutadapt output already exists for R1 and R2
Skipping RS-03984991 - Cutadapt output already exists for R1 and R2
Skipping RS-03984992 - Cutadapt output already exists for R1 and R2
Skipping RS-03984993 - Cutadapt output already exists for R1 and R2
Skipping RS-03984995 - Cutadapt output already exists for R1 and R2
Skipping RS-03984996 - Cutadapt output already exists for R1 and R2
Skipping RS-03984997 - Cutadapt output already exists for R1 and R2
Skipping RS-03984998 - Cutadapt output already exists for R1 and R2
Skipping RS-03984999 - Cutadapt output already exists for R1 and R2
Skipping RS-03985000 - Cutadapt output already exists for R1 and R2
Skipping RS-03985001 - Cutadapt output already exists for R1 and R2
Skipping RS-03985002 - Cutadapt output already exists for R1 and R2
Skipping RS-03985003 - Cutadapt output already exists for R1 and R2
Skipping RS-03985004 - Cutadapt output already exists for R1 and R2
Skipping RS-03985005 - Cutadapt output already exists for R1 and R2
Skipping RS-03985006 - Cutadapt output already exists for R1 and R2
Skipping RS-03985007 - Cutadapt output already exists for R1 and R2
Skipping RS-03985008 - Cutadapt output already exists for R1 and R2
Skipping RS-03985009 - Cutadapt output already exists for R1 and R2
Skipping RS-03984975 - Cutadapt output already exists for R1 and R2
Skipping RS-03984976 - Cutadapt output already exists for R1 and R2
Skipping RS-03984977 - Cutadapt output already exists for R1 and R2
Skipping RS-03984978 - Cutadapt output already exists for R1 and R2
Skipping RS-03984979 - Cutadapt output already exists for R1 and R2
Skipping RS-03985011 - Cutadapt output already exists for R1 and R2
Skipping RS-03985012 - Cutadapt output already exists for R1 and R2
Skipping RS-03985013 - Cutadapt output already exists for R1 and R2
Skipping RS-03985014 - Cutadapt output already exists for R1 and R2
Skipping RS-03985015 - Cutadapt output already exists for R1 and R2
Skipping RS-03985016 - Cutadapt output already exists for R1 and R2
Skipping RS-03985017 - Cutadapt output already exists for R1 and R2
Skipping RS-03985018 - Cutadapt output already exists for R1 and R2
Skipping RS-03985019 - Cutadapt output already exists for R1 and R2
Skipping RS-03985020 - Cutadapt output already exists for R1 and R2
Skipping RS-03985021 - Cutadapt output already exists for R1 and R2
Skipping RS-03985022 - Cutadapt output already exists for R1 and R2
Skipping RS-03985023 - Cutadapt output already exists for R1 and R2
Skipping RS-03985024 - Cutadapt output already exists for R1 and R2
Skipping RS-03985025 - Cutadapt output already exists for R1 and R2
Skipping RS-03985026 - Cutadapt output already exists for R1 and R2
Skipping RS-03985027 - Cutadapt output already exists for R1 and R2
Skipping RS-03985028 - Cutadapt output already exists for R1 and R2
Cutadapt for all R1 and R2 files completed.

Quality score trimming (Trimmomatic) ¶

InĀ [19]:
# Create a list to store the Trimmomatic subprocess instances
trimm_processes = []

# Iterate over the RS-* folders
for folder in folders:
    folder_name = os.path.basename(folder)

    r1_files = glob.glob(os.path.join(output_cut_dir, folder_name, "*R1_001.trimmed.fastq.gz"))[0]
    r2_files = glob.glob(os.path.join(output_cut_dir, folder_name, "*R2_001.trimmed.fastq.gz"))[0]

    if not r1_files or not r2_files:
        print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
        continue

    # Create a subfolder for the current sample's FastQC output
    sample_trim_dir = os.path.join(output_trim_dir, folder_name)
    os.makedirs(sample_trim_dir, exist_ok=True)

    # Define input and output file paths
    r1_files_name = os.path.basename(r1_files).split('.')[0]
    r2_files_name = os.path.basename(r2_files).split('.')[0]
    r1_output_paired = os.path.join(sample_trim_dir, f"{r1_files_name}.trim_paired.fastq.gz")
    r2_output_paired = os.path.join(sample_trim_dir, f"{r2_files_name}.trim_paired.fastq.gz")
    r1_output_unpaired = os.path.join(sample_trim_dir, f"{r1_files_name}.trim_unpaired.fastq.gz")
    r2_output_unpaired = os.path.join(sample_trim_dir, f"{r2_files_name}.trim_unpaired.fastq.gz")

    # Check if Trimmomatic output already exists for R1 and R2 files
    if os.path.exists(r1_output_paired) and os.path.exists(r2_output_paired) and \
        os.path.exists(r1_output_unpaired) and os.path.exists(r2_output_unpaired):
        print(f"Skipping {folder_name} - Trimmomatic output already exists for R1 and R2")
    else:
        # Create a log directory inside the output_trim_dir
        log_directory = os.path.join(output_trim_dir, "trimmomatic_logs")
        os.makedirs(log_directory, exist_ok=True)
        
        # Define log file path for stdout
        stdout_log_path = os.path.join(log_directory, f"{folder_name}_stdout.log")

        # Run TrimmomaticPE for R1 and R2 files
        command = [
            "TrimmomaticPE",
            "-trimlog", stdout_log_path,
            r1_files, r2_files,
            r1_output_paired, r1_output_unpaired,
            r2_output_paired, r2_output_unpaired,
            "LEADING:20",
            "TRAILING:20",
            "SLIDINGWINDOW:4:15",
            "MINLEN:25"
        ]

        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        trimm_processes.append(process)

# Wait for all subprocesses to finish
for process in trimm_processes:
    _, stderr_output = process.communicate()
    if process.returncode != 0:
        print(f"An error occurred: {stderr_output.decode('utf-8')}")

print("Trimmomatic for all R1 and R2 files completed.")
Skipping RS-03984980 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985010 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985029 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985030 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985031 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985032 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985033 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985034 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985035 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985036 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985037 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985038 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985039 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985040 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985041 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068630 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068631 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068632 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068633 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068634 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068635 - Trimmomatic output already exists for R1 and R2
Skipping RS-04068636 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984981 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984982 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984983 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984984 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984985 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984986 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984987 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984988 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984989 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984990 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984991 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984992 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984993 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984995 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984996 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984997 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984998 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984999 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985000 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985001 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985002 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985003 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985004 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985005 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985006 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985007 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985008 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985009 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984975 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984976 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984977 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984978 - Trimmomatic output already exists for R1 and R2
Skipping RS-03984979 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985011 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985012 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985013 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985014 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985015 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985016 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985017 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985018 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985019 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985020 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985021 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985022 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985023 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985024 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985025 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985026 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985027 - Trimmomatic output already exists for R1 and R2
Skipping RS-03985028 - Trimmomatic output already exists for R1 and R2
Trimmomatic for all R1 and R2 files completed.

FASTQ – Post QC ¶

InĀ [20]:
# Create a list to store the FastQC subprocess instances
postfastqc_processes = []

# Specify the folder for FastQC output
folder_fastqc_dir = os.path.join(base_dir, f"{directory}_Post_FASTQC")

# Create the FastQC output directory if it doesn't exist
os.makedirs(folder_fastqc_dir, exist_ok=True)

# Iterate over the RS-* folders
for folder in folders:
    folder_name = os.path.basename(folder)
    r1_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R1_001.trim_paired.fastq.gz"))[0]
    r2_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R2_001.trim_paired.fastq.gz"))[0]

    if not r1_files or not r2_files:
        print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
        continue

    # Create a subfolder for the current sample's FastQC output
    sample_fastqc_dir = os.path.join(folder_fastqc_dir, folder_name)
    os.makedirs(sample_fastqc_dir, exist_ok=True)

    r1_files_name = os.path.basename(r1_files).split('.')[0]
    r2_files_name = os.path.basename(r2_files).split('.')[0]

    r1_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}.trim_paired_fastqc.html")
    r2_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r2_files_name)}.trim_paired_fastqc.html")

    # Check if FastQC output already exists for R1 file
    if os.path.exists(r1_exist):
        print(f"Skipping {folder_name} - FastQC output already exists for R1")
    else:
        # Run FastQC for R1 file
        command_r1 = [
            "fastqc",
            "-f", "fastq",
            "--extract",
            "-t", "9",
            "-o", sample_fastqc_dir,
            r1_files
        ]
        process_r1 = subprocess.Popen(command_r1, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        postfastqc_processes.append(process_r1)

    # Check if FastQC output already exists for R2 file
    if os.path.exists(r2_exist):
        print(f"Skipping {folder_name} - FastQC output already exists for R2")
    else:
        # Run FastQC for R2 file
        command_r2 = [
            "fastqc",
            "-f", "fastq",
            "--extract",
            "-t", "9",
            "-o", sample_fastqc_dir,
            r2_files
        ]
        process_r2 = subprocess.Popen(command_r2, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        postfastqc_processes.append(process_r2)

# Wait for all subprocesses to finish and re-run ones with errors
for process in postfastqc_processes:
    _, stderr_output = process.communicate()
    if process.returncode != 0:
        print(f"Error message: {stderr_output.decode('utf-8')}")
        
        # Re-run the same FastQC command
        re_run_process = subprocess.Popen(
            process.args,  # Re-run the same command
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        _, re_run_stderr = re_run_process.communicate()
        if re_run_process.returncode != 0:
            print("Failed to re-run FastQC.")

print("FastQC for all R1 and R2 files completed.")
Skipping RS-03984980 - FastQC output already exists for R1
Skipping RS-03984980 - FastQC output already exists for R2
Skipping RS-03985010 - FastQC output already exists for R1
Skipping RS-03985010 - FastQC output already exists for R2
Skipping RS-03985029 - FastQC output already exists for R1
Skipping RS-03985029 - FastQC output already exists for R2
Skipping RS-03985030 - FastQC output already exists for R1
Skipping RS-03985030 - FastQC output already exists for R2
Skipping RS-03985031 - FastQC output already exists for R1
Skipping RS-03985031 - FastQC output already exists for R2
Skipping RS-03985032 - FastQC output already exists for R1
Skipping RS-03985032 - FastQC output already exists for R2
Skipping RS-03985033 - FastQC output already exists for R1
Skipping RS-03985033 - FastQC output already exists for R2
Skipping RS-03985034 - FastQC output already exists for R1
Skipping RS-03985034 - FastQC output already exists for R2
Skipping RS-03985035 - FastQC output already exists for R1
Skipping RS-03985035 - FastQC output already exists for R2
Skipping RS-03985036 - FastQC output already exists for R1
Skipping RS-03985036 - FastQC output already exists for R2
Skipping RS-03985037 - FastQC output already exists for R1
Skipping RS-03985037 - FastQC output already exists for R2
Skipping RS-03985038 - FastQC output already exists for R1
Skipping RS-03985038 - FastQC output already exists for R2
Skipping RS-03985039 - FastQC output already exists for R1
Skipping RS-03985039 - FastQC output already exists for R2
Skipping RS-03985040 - FastQC output already exists for R1
Skipping RS-03985040 - FastQC output already exists for R2
Skipping RS-03985041 - FastQC output already exists for R1
Skipping RS-03985041 - FastQC output already exists for R2
Skipping RS-04068630 - FastQC output already exists for R1
Skipping RS-04068630 - FastQC output already exists for R2
Skipping RS-04068631 - FastQC output already exists for R1
Skipping RS-04068631 - FastQC output already exists for R2
Skipping RS-04068632 - FastQC output already exists for R1
Skipping RS-04068632 - FastQC output already exists for R2
Skipping RS-04068633 - FastQC output already exists for R1
Skipping RS-04068633 - FastQC output already exists for R2
Skipping RS-04068634 - FastQC output already exists for R1
Skipping RS-04068634 - FastQC output already exists for R2
Skipping RS-04068635 - FastQC output already exists for R1
Skipping RS-04068635 - FastQC output already exists for R2
Skipping RS-04068636 - FastQC output already exists for R1
Skipping RS-04068636 - FastQC output already exists for R2
Skipping RS-03984981 - FastQC output already exists for R1
Skipping RS-03984981 - FastQC output already exists for R2
Skipping RS-03984982 - FastQC output already exists for R1
Skipping RS-03984982 - FastQC output already exists for R2
Skipping RS-03984983 - FastQC output already exists for R1
Skipping RS-03984983 - FastQC output already exists for R2
Skipping RS-03984984 - FastQC output already exists for R1
Skipping RS-03984984 - FastQC output already exists for R2
Skipping RS-03984985 - FastQC output already exists for R1
Skipping RS-03984985 - FastQC output already exists for R2
Skipping RS-03984986 - FastQC output already exists for R1
Skipping RS-03984986 - FastQC output already exists for R2
Skipping RS-03984987 - FastQC output already exists for R1
Skipping RS-03984987 - FastQC output already exists for R2
Skipping RS-03984988 - FastQC output already exists for R1
Skipping RS-03984988 - FastQC output already exists for R2
Skipping RS-03984989 - FastQC output already exists for R1
Skipping RS-03984989 - FastQC output already exists for R2
Skipping RS-03984990 - FastQC output already exists for R1
Skipping RS-03984990 - FastQC output already exists for R2
Skipping RS-03984991 - FastQC output already exists for R1
Skipping RS-03984991 - FastQC output already exists for R2
Skipping RS-03984992 - FastQC output already exists for R1
Skipping RS-03984992 - FastQC output already exists for R2
Skipping RS-03984993 - FastQC output already exists for R1
Skipping RS-03984993 - FastQC output already exists for R2
Skipping RS-03984995 - FastQC output already exists for R1
Skipping RS-03984995 - FastQC output already exists for R2
Skipping RS-03984996 - FastQC output already exists for R1
Skipping RS-03984996 - FastQC output already exists for R2
Skipping RS-03984997 - FastQC output already exists for R1
Skipping RS-03984997 - FastQC output already exists for R2
Skipping RS-03984998 - FastQC output already exists for R1
Skipping RS-03984998 - FastQC output already exists for R2
Skipping RS-03984999 - FastQC output already exists for R1
Skipping RS-03984999 - FastQC output already exists for R2
Skipping RS-03985000 - FastQC output already exists for R1
Skipping RS-03985000 - FastQC output already exists for R2
Skipping RS-03985001 - FastQC output already exists for R1
Skipping RS-03985001 - FastQC output already exists for R2
Skipping RS-03985002 - FastQC output already exists for R1
Skipping RS-03985002 - FastQC output already exists for R2
Skipping RS-03985003 - FastQC output already exists for R1
Skipping RS-03985003 - FastQC output already exists for R2
Skipping RS-03985004 - FastQC output already exists for R1
Skipping RS-03985004 - FastQC output already exists for R2
Skipping RS-03985005 - FastQC output already exists for R1
Skipping RS-03985005 - FastQC output already exists for R2
Skipping RS-03985006 - FastQC output already exists for R1
Skipping RS-03985006 - FastQC output already exists for R2
Skipping RS-03985007 - FastQC output already exists for R1
Skipping RS-03985007 - FastQC output already exists for R2
Skipping RS-03985008 - FastQC output already exists for R1
Skipping RS-03985008 - FastQC output already exists for R2
Skipping RS-03985009 - FastQC output already exists for R1
Skipping RS-03985009 - FastQC output already exists for R2
Skipping RS-03984975 - FastQC output already exists for R1
Skipping RS-03984975 - FastQC output already exists for R2
Skipping RS-03984976 - FastQC output already exists for R1
Skipping RS-03984976 - FastQC output already exists for R2
Skipping RS-03984977 - FastQC output already exists for R1
Skipping RS-03984977 - FastQC output already exists for R2
Skipping RS-03984978 - FastQC output already exists for R1
Skipping RS-03984978 - FastQC output already exists for R2
Skipping RS-03984979 - FastQC output already exists for R1
Skipping RS-03984979 - FastQC output already exists for R2
Skipping RS-03985011 - FastQC output already exists for R1
Skipping RS-03985011 - FastQC output already exists for R2
Skipping RS-03985012 - FastQC output already exists for R1
Skipping RS-03985012 - FastQC output already exists for R2
Skipping RS-03985013 - FastQC output already exists for R1
Skipping RS-03985013 - FastQC output already exists for R2
Skipping RS-03985014 - FastQC output already exists for R1
Skipping RS-03985014 - FastQC output already exists for R2
Skipping RS-03985015 - FastQC output already exists for R1
Skipping RS-03985015 - FastQC output already exists for R2
Skipping RS-03985016 - FastQC output already exists for R1
Skipping RS-03985016 - FastQC output already exists for R2
Skipping RS-03985017 - FastQC output already exists for R1
Skipping RS-03985017 - FastQC output already exists for R2
Skipping RS-03985018 - FastQC output already exists for R1
Skipping RS-03985018 - FastQC output already exists for R2
Skipping RS-03985019 - FastQC output already exists for R1
Skipping RS-03985019 - FastQC output already exists for R2
Skipping RS-03985020 - FastQC output already exists for R1
Skipping RS-03985020 - FastQC output already exists for R2
Skipping RS-03985021 - FastQC output already exists for R1
Skipping RS-03985021 - FastQC output already exists for R2
Skipping RS-03985022 - FastQC output already exists for R1
Skipping RS-03985022 - FastQC output already exists for R2
Skipping RS-03985023 - FastQC output already exists for R1
Skipping RS-03985023 - FastQC output already exists for R2
Skipping RS-03985024 - FastQC output already exists for R1
Skipping RS-03985024 - FastQC output already exists for R2
Skipping RS-03985025 - FastQC output already exists for R1
Skipping RS-03985025 - FastQC output already exists for R2
Skipping RS-03985026 - FastQC output already exists for R1
Skipping RS-03985026 - FastQC output already exists for R2
Skipping RS-03985027 - FastQC output already exists for R1
Skipping RS-03985027 - FastQC output already exists for R2
Skipping RS-03985028 - FastQC output already exists for R1
Skipping RS-03985028 - FastQC output already exists for R2
FastQC for all R1 and R2 files completed.

Alignment ¶

Parallel alignment (Bowtie2) ¶

InĀ [21]:
# Create a list to store the subprocess instances
processes = []

# Iterate over the RS-* folders
for folder in folders:
    folder_name = os.path.basename(folder)

    r1_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R1_001.trim_paired.fastq.gz"))[0]
    r2_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R2_001.trim_paired.fastq.gz"))[0]

    folder_name = os.path.basename(folder)
    output_sam = os.path.join(output_sam_dir, folder_name + ".sam")
    output_bam = os.path.join(output_bam_dir, folder_name + ".bam")
    sam_compress = os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")
    bam_compress = os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")

    # Check if SAM file already exists in the output directory
    if os.path.exists(output_sam) or os.path.exists(output_bam) or os.path.exists(sam_compress) or os.path.exists(bam_compress):
        print(f"Skipping {folder_name} - file already exists")
        continue

    command = [
        "bowtie2",
        "-x", reference_index_dir,
        "-1", r1_files,
        "-2", r2_files,
        "-p", "9",
        "-S", output_sam,
        "--local",
        "--very-sensitive-local",
        "-I", "10",
        "-X", "700",
        "--dovetail",
        "--no-unal",
        "--no-mixed"
    ]

    # Create a log file for subprocess output
    log_file = os.path.join(output_sam_dir, f"{folder_name}_output.log")
    log_file_handle = open(log_file, "w")

    # Run the command in the background using subprocess.Popen
    process = subprocess.Popen(command, stdout=log_file_handle, stderr=subprocess.STDOUT)

    # Add the process and log file handle to the list
    processes.append((process, log_file_handle))

# Monitor the real-time output of subprocesses
while processes:
    finished_processes = []
    for process, log_file_handle in processes:
        return_code = process.poll()

        if return_code is None:
            # Process is still running
            # Close the log file handle
            log_file_handle.close()

            # Reopen the log file in read mode
            log_file_handle = open(log_file, "r")

            # Read and process the latest output
            output_lines = log_file_handle.readlines()

            # Check if stdout is redirected correctly
            if output_lines:
                # Process and display the output
                for line in output_lines:
                    print(line.strip())
                # Optionally, you can update a progress indicator here
            else:
                # Skip printing output if stdout is correctly redirected
                continue

        else:
            # Process has finished
            finished_processes.append((process, log_file_handle))

    # Remove finished processes from the list
    for process, log_file_handle in finished_processes:
        processes.remove((process, log_file_handle))
        log_file_handle.close()

    # Wait before checking the subprocesses again
    time.sleep(1)

# Get the list of log files in folder.sam directory
log_files = glob.glob(os.path.join(output_sam_dir, "*_output.log"))
Skipping RS-03984980 - file already exists
Skipping RS-03985010 - file already exists
Skipping RS-03985029 - file already exists
Skipping RS-03985030 - file already exists
Skipping RS-03985031 - file already exists
Skipping RS-03985032 - file already exists
Skipping RS-03985033 - file already exists
Skipping RS-03985034 - file already exists
Skipping RS-03985035 - file already exists
Skipping RS-03985036 - file already exists
Skipping RS-03985037 - file already exists
Skipping RS-03985038 - file already exists
Skipping RS-03985039 - file already exists
Skipping RS-03985040 - file already exists
Skipping RS-03985041 - file already exists
Skipping RS-04068630 - file already exists
Skipping RS-04068631 - file already exists
Skipping RS-04068632 - file already exists
Skipping RS-04068633 - file already exists
Skipping RS-04068634 - file already exists
Skipping RS-04068635 - file already exists
Skipping RS-04068636 - file already exists
Skipping RS-03984981 - file already exists
Skipping RS-03984982 - file already exists
Skipping RS-03984983 - file already exists
Skipping RS-03984984 - file already exists
Skipping RS-03984985 - file already exists
Skipping RS-03984986 - file already exists
Skipping RS-03984987 - file already exists
Skipping RS-03984988 - file already exists
Skipping RS-03984989 - file already exists
Skipping RS-03984990 - file already exists
Skipping RS-03984991 - file already exists
Skipping RS-03984992 - file already exists
Skipping RS-03984993 - file already exists
Skipping RS-03984995 - file already exists
Skipping RS-03984996 - file already exists
Skipping RS-03984997 - file already exists
Skipping RS-03984998 - file already exists
Skipping RS-03984999 - file already exists
Skipping RS-03985000 - file already exists
Skipping RS-03985001 - file already exists
Skipping RS-03985002 - file already exists
Skipping RS-03985003 - file already exists
Skipping RS-03985004 - file already exists
Skipping RS-03985005 - file already exists
Skipping RS-03985006 - file already exists
Skipping RS-03985007 - file already exists
Skipping RS-03985008 - file already exists
Skipping RS-03985009 - file already exists
Skipping RS-03984975 - file already exists
Skipping RS-03984976 - file already exists
Skipping RS-03984977 - file already exists
Skipping RS-03984978 - file already exists
Skipping RS-03984979 - file already exists
Skipping RS-03985011 - file already exists
Skipping RS-03985012 - file already exists
Skipping RS-03985013 - file already exists
Skipping RS-03985014 - file already exists
Skipping RS-03985015 - file already exists
Skipping RS-03985016 - file already exists
Skipping RS-03985017 - file already exists
Skipping RS-03985018 - file already exists
Skipping RS-03985019 - file already exists
Skipping RS-03985020 - file already exists
Skipping RS-03985021 - file already exists
Skipping RS-03985022 - file already exists
Skipping RS-03985023 - file already exists
Skipping RS-03985024 - file already exists
Skipping RS-03985025 - file already exists
Skipping RS-03985026 - file already exists
Skipping RS-03985027 - file already exists
Skipping RS-03985028 - file already exists

Post-QC MultiQC analysis ¶

InĀ [22]:
# Specify the folder and file name for the MultiQC report
multiqc_output_dir = os.path.join(base_dir, f"{directory}_Post-QC_MultiQC")
os.makedirs(multiqc_output_dir, exist_ok=True)
multiqc_report_file = os.path.join(multiqc_output_dir, f"{directory}_Post-QC_multiqc_report.html")

# Generate the MultiQC report with --interactive flag
if os.path.exists(multiqc_report_file):
    print("Skipping MultiQC - Report file already exists.")
else:
    # Generate the MultiQC report with --interactive flag
    multiqc_command = [
        "multiqc",
        "--interactive",
        base_dir,
        "-o", multiqc_output_dir,
        "--filename", f"{directory}_Post-QC_multiqc_report.html"
    ]
    subprocess.run(multiqc_command, check=True)

print("MultiQC report generation completed.")

# # Display the MultiQC report in the notebook
# display(HTML(filename=os.path.join(PATH, "multiqc_report.html")))
Skipping MultiQC - Report file already exists.
MultiQC report generation completed.

Merging alignment logs ¶

InĀ [23]:
# Create an empty list to store the data
data = []

# Process each log file
for log_file_path in log_files:
    log_file_name = os.path.basename(log_file_path)
    filename = os.path.splitext(log_file_name)[0]
    filename = filename.split("_")[0]

    totalread = alignrate = concordant = multiple = discordant = None  # Initialize the variables

    with open(log_file_path) as f:
        for line in f.readlines():
            line = line.strip()
            if 'reads; of these:' in line:
                totalread = line.split(' ')[0]
            elif 'overall alignment rate' in line:
                alignrate = line.split(' ')[0]
            elif 'aligned concordantly exactly 1 time' in line:
                concordant = line.split(' ')[0]
            elif 'aligned concordantly >1 times' in line:
                multiple = line.split(' ')[0]
            elif 'aligned discordantly 1 time' in line:
                discordant = line.split(' ')[0]
        data.append([filename, totalread, alignrate, concordant, multiple, discordant])

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['Filename', 'TotalRead', 'AlignRate', 'Concordant', 'Multiple', 'Discordant'])
# Write the DataFrame to a CSV file
output_path = os.path.join(output_sam_dir, f'{directory}_summary.csv')
df.to_csv(output_path, index=False)

Alignment summary ¶

InĀ [24]:
# Read the table from XLSX
data_summary = pd.read_csv(f"{output_path}").sort_values(by='Filename')

# Replace Filename values with sample_key.values() if matched with sample_key.keys()
for key, values in sample_key.items():
    data_summary.loc[data_summary["Filename"] == key, "Filename"] = values[0]

# Remove the percentage sign from AlignRate and convert to numeric format
data_summary['AlignRate'] = data_summary['AlignRate'].str.rstrip('%').astype(float)

# Create subplots with adjustable figure size
fig, axes = plt.subplots(3, 1, figsize=(len(data_summary) * 3, len(data_summary) * 2))

# Use dedicated color for samples (if not in drug_color_map, then assign random colors)
sample_color_list = {filename: drug_color_map[filename.split("-")[1]] for filename in data_summary['Filename']}

# Bar plot of TotalRead
sns.barplot(ax=axes[0], x='Filename', y='TotalRead', data=data_summary, palette=sample_color_list)
axes[0].set_xlabel('Samples')
axes[0].set_ylabel('TotalRead')
axes[0].set_title('Samples TotalRead', fontsize=(len(data_summary) * 2))  # Adjust the title font size
axes[0].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size

# Add data labels to the middle of the bars
for p in axes[0].patches:
    axes[0].annotate(f'{p.get_height():,.0f}', (p.get_x() + p.get_width() / 2, p.get_height()),
                     ha='center', va='center', xytext=(0, 5), textcoords='offset points', fontsize=(len(data_summary) * 1), rotation=90)

# Grouped bar plot of Concordant, Multiple, and Discordant
axes[1].set_ylim(0, max(data_summary['TotalRead']))
axes[1].bar(data_summary['Filename'], data_summary['Concordant'], label='Concordant', color='#C0C0C0')
axes[1].bar(data_summary['Filename'], data_summary['Multiple'], label='Multiple',
            bottom=data_summary['Concordant'], color='#808080')
axes[1].bar(data_summary['Filename'], data_summary['Discordant'], label='Discordant',
            bottom=data_summary['Concordant'] + data_summary['Multiple'], color='#DCDCDC')
axes[1].set_xlabel('Samples')
axes[1].set_ylabel('Count')
axes[1].set_title('Concordant, Multiple, and Discordant Counts', fontsize=(len(data_summary) * 2))  # Adjust the title font size
axes[1].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size
axes[1].legend()

# Add data labels to the bars
for p in axes[1].patches:
    width = p.get_width()
    height = p.get_height()
    x = p.get_x()
    y = p.get_y()

    if p.get_label() == 'Discordant':
        label = f'{height / 1000:,.0f}'  # Format the value in thousands
        axes[1].annotate(f'{p.get_height():,.0f}', (p.get_x() + p.get_width() / 2, p.get_height()),
                         ha='center', va='center', xytext=(0, 20), textcoords='offset points', fontsize=(len(data_summary) * 1))
    else:
        label = f'{height / 1000:,.0f}'  # Format the value in thousands
        axes[1].annotate(label, (x + width / 2, y + height / 2),
                         ha='center', va='center', fontsize=(len(data_summary) * 1))

# Bar plot of AlignRate
sns.barplot(ax=axes[2], x='Filename', y='AlignRate', data=data_summary, palette=sample_color_list)
axes[2].set_xlabel('Samples')
axes[2].set_ylabel('AlignRate')
axes[2].set_title('Samples AlignRate', fontsize=(len(data_summary) * 2))  # Adjust the title font size
axes[2].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size
axes[2].set_ylim(0, 100)  # Set y-axis range from 0 to 100%

# Add data labels to the middle of the bars for AlignRate in percentage format
for p in axes[2].patches:
    axes[2].annotate(f'{p.get_height()}%', (p.get_x() + p.get_width() / 2, p.get_height()),
                     ha='center', va='center', xytext=(0, 5), textcoords='offset points', fontsize=(len(data_summary) * 1), rotation=90)

# Rotate x-axis labels for all subplots
for ax in axes:
    ax.tick_params(axis='x', labelrotation=90, size=(len(data_summary) * 3))

# Step 2: Adjust x-axis limits for axes 1
axes[1].set_xlim(-0.5, len(data_summary['Filename']) - 0.5)

# # Adjust spacing between subplots
# plt.tight_layout()

# Define the Read summary graph file path for graph storing
read_summary_path = os.path.join(graphs_files, f"{directory}_Read_Summary.svg")
plt.savefig(read_summary_path, format='svg', bbox_inches='tight', dpi=300)  # Use bbox_inches='tight' to include all elements
print(f"{directory}_Read_Summary.svg saved to {read_summary_path}")

# plt.tight_layout()
plt.close()
# plt.show()
RQ023682_Read_Summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/RQ023682_Read_Summary.svg

Post-Alignment ¶

SAM files to BAM files (samtools) ¶

InĀ [25]:
# Get the list of sam files in folder.sam directory
sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam"))

# Create the output .csv directory
output_csv = os.path.join(base_dir, directory + "_rc.csv")
os.makedirs(output_csv, exist_ok=True)

# Get the list of bam files in folder.bam directory
bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))

# Lock for synchronization
convert_lock = threading.Lock()

def convert_sam_to_bam(sam_file):
    with convert_lock:
        bam_file = os.path.join(output_bam_dir, os.path.basename(sam_file).replace(".sam", ".bam"))
        if not os.path.exists(bam_file):
            pysam.view("-bhS", "-o", bam_file, sam_file, catch_stdout=False)
            print(f"Conversion completed for {sam_file} - BAM file: {bam_file}")
            return bam_file

# Use ProcessPoolExecutor to run conversions in parallel
with ProcessPoolExecutor(max_workers=None) as executor:
    bam_files = list(executor.map(convert_sam_to_bam, sam_files))

print("All conversions complete.")
All conversions complete.

Processing BAM files (samtools) ¶

InĀ [26]:
def run_subprocess(args, step_name, log_file, bam_file):
    process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    try:
        stdout, stderr = process.communicate()
        return_code = process.returncode

        log_file.write(f"Step {step_name} - STDOUT:\n")
        log_file.write(stdout)
        log_file.write("\n")
        log_file.write(f"Step {step_name} - STDERR:\n")
        log_file.write(stderr)
        log_file.write("\n")

        # Check for expected merge message in stderr
        expected_merge_message = "[bam_sort_core] merging from"
        if expected_merge_message in stderr:
            # print(f"Expected merge message found in {step_name} for {bam_file}")
            return_code = 0  # Consider it as a success
        
        if return_code != 0:
            log_file.write(f"Step {step_name} - Error Return Code: {return_code}\n")
            log_file.write(f"Error in {step_name} - {stderr}\n")
            print(f"Error in {step_name} for {bam_file}")
        
        return return_code
    except:
        process.terminate()
        raise

MAX_RETRY_COUNT = 1

def process_bam(bam_file, log_dir):
    bam_name = os.path.basename(bam_file)
    bam_name_no_extension = os.path.splitext(bam_name)[0]
    log_file_path = os.path.join(log_dir, f"{bam_name_no_extension}_log.txt")

    # Define the output_processed_bam_dir
    output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")

    def run_subprocess_with_retries(args, step_name, log_file):
        for retry in range(MAX_RETRY_COUNT):
            return_code = run_subprocess(args, step_name, log_file)
            if return_code == 0:
                break
            else:
                print(f"Error in {step_name} for {bam_file}, Retry {retry+1}/{MAX_RETRY_COUNT}")
        
        if return_code != 0:
            print(f"Max retry reached for {step_name} - {bam_file}")
        
        return return_code

    def log_step_output(step_name, return_code):
        with open(log_file_path, "a") as log_file:
            log_file.write(f"Step {step_name} - Return Code: {return_code}\n")

    # Step 1: Remove unmapped reads
    unmapped_removed_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_unmapped_removed.bam")
    with open(log_file_path, "a") as log_file:
        step_name = "1 - Remove unmapped reads"
        return_code = run_subprocess(
            ["samtools", "view", "-@9", "-h", "-F", "4", "-b", "-o", unmapped_removed_bam_file, bam_file],
            step_name, log_file, bam_file
        )
        log_step_output(step_name, return_code)
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return return_code

    # Step 2: Sort by name
    sorted_name_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_sorted_name.bam")
    with open(log_file_path, "a") as log_file:
        step_name = "2 - Sort by name"
        return_code = run_subprocess(
            ["samtools", "sort", "-@9", "-m", "2G", "-n", "-o", sorted_name_bam_file, unmapped_removed_bam_file],
            step_name, log_file, bam_file
        )
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return

    # Step 3: Add/correct mate pair information
    fixmate_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_fixmate.bam")
    with open(log_file_path, "a") as log_file:
        step_name = "3 - Add/correct mate pair information"
        return_code = run_subprocess(
            ["samtools", "fixmate", "-@9", "-m", sorted_name_bam_file, fixmate_bam_file],
            step_name, log_file, bam_file
        )
        log_step_output(step_name, return_code)
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return

    # Step 4: Sort by genome coordinate
    sorted_coord_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_sorted_coord.bam")
    with open(log_file_path, "a") as log_file:
        step_name = "4 - Sort by genome coordinate"
        return_code = run_subprocess(
            ["samtools", "sort", "-@9", "-m", "2G", "-o", sorted_coord_bam_file, fixmate_bam_file],
            step_name, log_file, bam_file
        )
        log_step_output(step_name, return_code)
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return

    # Step 5: Mark duplicates
    marked_duplicates_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_processed.bam")
    with open(log_file_path, "a") as log_file:
        step_name = "5 - Mark duplicates"
        return_code = run_subprocess(
            ["samtools", "markdup", "-@9", sorted_coord_bam_file, marked_duplicates_bam_file],
            step_name, log_file, bam_file
        )
        log_step_output(step_name, return_code)
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return

    # Step 6: Index the aligned BAM file
    index_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}.bai")
    with open(log_file_path, "a") as log_file:
        step_name = "6 - Index the aligned BAM file"
        return_code = run_subprocess(
            ["samtools", "index", marked_duplicates_bam_file],
            step_name, log_file, bam_file
        )
        log_step_output(step_name, return_code)
        if return_code != 0:
            print(f"Error in {step_name} for {bam_file}")
            return

    # Clean up intermediate files from previous steps
    os.remove(unmapped_removed_bam_file)
    os.remove(sorted_name_bam_file)
    os.remove(fixmate_bam_file)
    os.remove(sorted_coord_bam_file)

    # print(f"Processing complete for {bam_file}")

def main():
    bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
    output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
    log_dir = os.path.join(output_processed_bam_dir, "log")  # Define and initialize log_dir
    os.makedirs(log_dir, exist_ok=True)
    errored_files = []

    with concurrent.futures.ProcessPoolExecutor() as executor:
        for return_code, bam_file in zip(executor.map(process_bam, bam_files, [log_dir] * len(bam_files)), bam_files):
            if return_code != 0:
                errored_files.append(bam_file)

    # # Retry processing for errored files
    # for bam_file in errored_files:
    #     print(f"Retrying processing for {bam_file}")
    #     process_bam(bam_file, log_dir)

    print("All processes complete.")

if __name__ == "__main__":
    try:
        # Check if processed BAM files already exist and skip processing
        bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
        output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")

        processed_bam_files = [os.path.join(output_processed_bam_dir, f"{os.path.splitext(os.path.basename(bam))[0]}_processed.bam")
                               for bam in bam_files]

        bam_files_to_process = [bam_file for bam_file, processed_bam_file in zip(bam_files, processed_bam_files)
                                if not os.path.exists(processed_bam_file)]

        if bam_files_to_process:
            print(f"Processing {len(bam_files_to_process)} BAM files.")
            main()  # Call the main processing function
        else:
            print("All BAM files are already processed. Skipping.")

    except KeyboardInterrupt:
        print("Execution interrupted.")
All BAM files are already processed. Skipping.

Collect read count from processed bam files ¶

InĀ [27]:
# Define processing_bam_dir variables
processing_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
processing_bam = glob.glob(os.path.join(processing_bam_dir, "*_processed.bam"))

# Lock for synchronization
collect_lock = threading.Lock()

def collect_read_counts_and_save(bam_file):
    mapq_threshold = 30
    read_count = {}
    
    with pysam.AlignmentFile(bam_file, "rb") as sorted_bam:
        for read in sorted_bam:
            if (
                # read.flag == 0  
                # read.is_proper_pair  # Check if the read is part of a proper pair
                # and not read.mate_is_unmapped  # Check if the mate is mapped
                read.mapping_quality >= mapq_threshold
            ):
                reference_name = sorted_bam.get_reference_name(read.reference_id)
                if reference_name != "*" and not read.is_unmapped:
                    read_count[reference_name] = read_count.get(reference_name, 0) + 1

    # Construct paths for output BAM and _rc.csv files
    bam_name = os.path.basename(bam_file)
    bam_name_no_extension = os.path.splitext(bam_name)[0]
    bam_name_no_extension = bam_name_no_extension.replace("_processed", "")
    output_csv_file = os.path.join(output_csv, f"{bam_name_no_extension}_rc.csv")

    # Save the read count data to CSV file
    with collect_lock:
        with open(output_csv_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['ID', bam_name_no_extension])
            for key, value in read_count.items():
                writer.writerow([key, value])

        # print(f"Processed {bam_file} and saved the result to {output_csv_file}")

# Use ProcessPoolExecutor to run read count collection and saving in parallel
with ProcessPoolExecutor(max_workers=None) as executor:
    executor.map(collect_read_counts_and_save, processing_bam)

print("All read count collection and saving processes complete.")
All read count collection and saving processes complete.

Merging samples read counts ¶

InĀ [28]:
# In case, wanna run only this part
output_csv = os.path.join(base_dir, directory+ "_rc.csv")

# Retrieving fasta_index file
merged_data = pd.read_csv(fasta_file)

# Get the list of CSV files in the folder
csv_files = glob.glob(os.path.join(output_csv, "*.csv"))

# Sort the CSV files list in ascending order
csv_files.sort()

# Process each CSV file
for csv_file in csv_files:
    sam_file_name = os.path.splitext(os.path.basename(csv_file))[0]
    sam_file_name = sam_file_name.split('_')[0]
    df = pd.read_csv(csv_file)
    df = df.rename(columns={"Read count": sam_file_name})
    merged_data = pd.merge(merged_data, df, on=['ID'], how='outer')

# Sort the merged data by ID in reverse order
merged_data = merged_data.sort_values(by='ORF_ID')

# Remove the 'ID' column
merged_data = merged_data.drop('ID', axis=1)

# Rename the headers based on the sample_key dictionary
for key, value in sample_key.items():
    if key in merged_data.columns:
        new_header = '_'.join(value) if isinstance(value, list) else value
        merged_data = merged_data.rename(columns={key: new_header})

# Define the output file path for the merged XLSX
merged_xlsx_path = os.path.join(database_files, f"{directory}_merged.xlsx")

# Save the merged data to an XLSX file
merged_data.to_excel(merged_xlsx_path, index=False)

print(f"Merged XLSX file saved to {merged_xlsx_path}")
Merged XLSX file saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/RQ023682_merged.xlsx

SAM & BAM Files compression (gzip, tar) ¶

InĀ [29]:
def compress_and_remove(file):
    with open(file, 'rb') as f_in:
        with gzip.open(file + '.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file)

def compress_and_create_tar(files, tar_filename, folder_name):
    with tarfile.open(tar_filename, 'w:gz') as tar:
        for file in files:
            compressed_filename = file + '.gz'
            with gzip.open(compressed_filename, 'wb') as f_out:
                with open(file, 'rb') as f_in:
                    shutil.copyfileobj(f_in, f_out)
            tar.add(compressed_filename, arcname=os.path.join(folder_name, os.path.basename(compressed_filename)))
            os.remove(compressed_filename)

if __name__ == '__main__':
    compression_handle = " "  # Set this to "Yes" to execute the compression and archiving
    
    if compression_handle == "Yes":
        sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam"))
        bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))

        with ProcessPoolExecutor() as executor:
            completed_tasks = 0
            total_tasks = len(sam_files) + len(bam_files)

            for result in executor.map(compress_and_remove, sam_files + bam_files):
                completed_tasks += 1
                print(f'Compressing: {completed_tasks}/{total_tasks}', end='\r')

        compressed_sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam.gz"))
        compressed_bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam.gz"))

        if not os.path.exists(os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")):
            compress_and_create_tar(
                compressed_sam_files,
                os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz"),
                output_sam_dir
            )

        if not os.path.exists(os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")):
            compress_and_create_tar(
                compressed_bam_files,
                os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz"),
                output_bam_dir
            )

        for file in compressed_sam_files:
            os.remove(file)
        for file in compressed_bam_files:
            os.remove(file)
    else:
        print("Compression handle is not set")
Compression handle is not set

SAM & BAM Files decompression (optional) ¶

InĀ [30]:
# Set the decompression handle here
decompression_handle = " "  # Set this to "yes" to enable decompression

def decompress_and_remove_tar(tar_filename, output_dir):
    with tarfile.open(tar_filename, 'r:gz') as tar:
        tar.extractall(output_dir)
    
    os.remove(tar_filename)

def decompress_gz_file(gz_file):
    with gzip.open(gz_file, 'rb') as f_in:
        original_filename = gz_file[:-3]  # Remove .gz extension
        with open(original_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(gz_file)

if __name__ == '__main__':
    # Specify the paths to the tar.gz files and the output directories
    sam_tar_file = os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")
    bam_tar_file = os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")

    sam_output_dir = output_sam_dir  # Output directory for SAM files
    bam_output_dir = output_bam_dir  # Output directory for BAM files

    if decompression_handle == "yes":
        decompress_and_remove_tar(sam_tar_file, sam_output_dir)
        decompress_and_remove_tar(bam_tar_file, bam_output_dir)

        sam_gz_files = glob.glob(os.path.join(sam_output_dir, "*.sam.gz"))
        bam_gz_files = glob.glob(os.path.join(bam_output_dir, "*.bam.gz"))

        with ProcessPoolExecutor() as executor:
            executor.map(decompress_gz_file, sam_gz_files)
            executor.map(decompress_gz_file, bam_gz_files)
    else:
        print("Decompression handle is not set")
Decompression handle is not set

Normalization ¶

Installing R packages (optional) ¶

InĀ [31]:
# Set install_R_handle to "yes" to enable package installation
install_R_handle = ""

if install_R_handle == "yes":
    # Install required packages
    # robjects.r('install.packages("BiocManager", repos="http://cran.r-project.org")')

    # Set the C++ compiler version
    os.environ['CXX'] = 'g++-9'

    # Install packages for normalization
    bioc_manager = robjects.packages.importr("BiocManager")
    bioc_manager.install("limma")
    bioc_manager.install("edgeR")
    robjects.r('install.packages("DescTools")')
else:
    print("Install R Handle is not set")
Install R Handle is not set
InĀ [32]:
from rpy2.robjects.packages import importr
from rpy2.robjects import r

# Load packages
base = importr('base')
limma = importr("limma")
edgeR = importr("edgeR")

# Check versions
limma_version = r('packageVersion("limma")')
edgeR_version = r('packageVersion("edgeR")')

print("limma version:", limma_version[0])
print("edgeR version:", edgeR_version[0])
limma version: [1]  3 58  1

edgeR version: [1]  4  0 16

Import R packages ¶

InĀ [33]:
# Activate R's base package
base = importr('base')

# Load packages
limma = importr("limma")
edgeR = importr("edgeR")
stats = importr("stats")
Desctools = importr("DescTools")

graphics = importr("graphics")

#Cheking current library Path
robjects.r(".libPaths()")

# Set R's working directory
robjects.r(f'setwd("{base_dir}")')
Out[33]:
array(['/media/kimlab/DATA1/harryjo/Reference/9.1_delta'], dtype='<U47')

Import functions ¶

InĀ [34]:
# Changing pandas dataframe to R dataframe
def df_to_r_dataframe(df):
    with (robjects.default_converter + pandas2ri.converter).context():
        r_df = robjects.conversion.get_conversion().py2rpy(df)
    return r_df

# Limma quantile normaization 
def limma_normalizeQuantiles(r_dataframe, ties=False):
    normalized_data = limma.normalizeQuantiles(r_dataframe, ties=False)
    return normalized_data

# Winsorization 
lower_quantile = 0.00001
upper_quantile = 0.99999
def winsorize_func(r_dataframe, lower_quantile, upper_quantile):
    colnames = list(r_dataframe.colnames)
    robjects.r('winsorized_cols <- list()')

    for col in colnames:
        col_str = str(col)
        robjects.r.assign("col_data", r_dataframe.rx2(col_str))

        robjects.r(f'''
            quantiles <- quantile(col_data, probs = c({lower_quantile}, {upper_quantile}), na.rm = TRUE)
            winsorized_cols[["{col_str}"]] <- DescTools::Winsorize(
                as.numeric(col_data),
                val = quantiles
            )
        ''')

    result = robjects.r('as.data.frame(winsorized_cols, check.names = FALSE)')
    
    # Convert R dataframe to pandas dataframe
    result_pd_df = robjects.conversion.rpy2py(result)
        
    # Reset index to start from 0
    result_pd_df.reset_index(drop=True, inplace=True)
    
    return result_pd_df

# Saving processed dataframe (without indices) with indices from fasta_data
# Path to the output CSV file
fasta_data = pd.read_csv(fasta_file)
# Cleaning out dAAVS1 and pDest from the dataframe 
exclude_keywords = ["dAAVS1", "pDest"]
fasta_data = fasta_data[~fasta_data.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]
# Changing ORF_ID columns to number and sort the dataframe, reset index
fasta_data.loc[:, "ORF_ID"] = pd.to_numeric(fasta_data["ORF_ID"], errors="coerce")
fasta_data = fasta_data.sort_values("ORF_ID").reset_index(drop=True)
# Remove unwanted columns
drop_columns = ["ID", "Length"]
fasta_data = fasta_data.drop(drop_columns, axis=1)
# Reset index to include inside dataframe
fasta_data = fasta_data.reset_index()
# Change dataframe type to float
cols_to_convert = fasta_data.columns[~fasta_data.columns.isin(['Group', 'Gene_Symbol'])]
fasta_data[cols_to_convert] = fasta_data[cols_to_convert].astype(float)

def save_dataframe(index_df: pd.DataFrame, analyzed_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
    """
    Save the merged DataFrame to an Excel file.

    Parameters:
        index_df (pd.DataFrame): DataFrame to be used as the index.
        analyzed_df (pd.DataFrame): DataFrame to be analyzed and merged.
        output_path (str): Output file path for saving the merged DataFrame.

    Returns:
        pd.DataFrame: The merged DataFrame.
    """
    # Reset index for the analyzed_df
    analyzed_df_indexed = analyzed_df.reset_index().astype(float)
    
    # Perform the merge based on column indices
    merged_df = index_df.merge(analyzed_df_indexed, on='index')
    merged_df.replace(['nan'], np.nan, inplace=True)
    merged_df.set_index('index', inplace=True)
    merged_df.index.name = None

    # Saving the merged DataFrame to xlsx
    merged_df.to_excel(output_path, index=False)
    print(f"DataFrame saved to {output_path}")
    return merged_df

# For TMM / GeTMM normalization
def edgeR_normfactor(data, handle):
    experimental_columns = data.columns
    
    if handle == "Control":
        # Define your control and experimental keywords
        neg_control_keywords = ["DMSO", "Baseline"]
        none_keywords = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"] 
        # Determine group names based on column names
        group_names = [
            "Control" if any(keyword in col for keyword in neg_control_keywords)
            else ("None" if any(keyword in col for keyword in none_keywords)
            else "Experimental")
            for col in experimental_columns
        ]
    elif handle == "Triplet":
        # Extract the second part of each column name after splitting by underscore
        extracted_keywords = [col.split('-')[1] for col in experimental_columns]
        group_names = extracted_keywords
        
    group_factor = robjects.FactorVector(group_names)
    
    data_raw_r = df_to_r_dataframe(data)
    
    dge = edgeR.DGEList(counts=data_raw_r, group=group_factor)
    dge = edgeR.calcNormFactors(dge, method="TMM")
    dge_normfactors_r = dge.rx2('samples')
    
    # dge_normfactor to Pandas dataframe
    with (robjects.default_converter + pandas2ri.converter).context():
        dge_normfactors_df = robjects.conversion.get_conversion().rpy2py(dge_normfactors_r)
    
    norm_raw = edgeR.cpm(dge)
    norm_log = edgeR.cpm(dge, log=True)
    
    norm_colnames = list(data_raw_r.colnames)
    
    return dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r

Gene count summary ¶

InĀ [35]:
# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")

# Cleaning out dAAVS1 and pDest from the dataframe 
exclude_keywords = ["dAAVS1", "pDest"]
gene_data = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]

# Select data based on header values
columns_to_include = [column for column in data_rc.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
gene_data = gene_data[columns_to_include]

# Create the sample_color_list using columns_to_include
sample_color_list = {column: drug_color_map[column.split("-")[1]] for column in columns_to_include}

# Count the number of genes per column
gene_counts = gene_data.count()

# Calculate gene count statistics
max_gene_count = gene_counts.max()
min_gene_count = gene_counts.min()
avg_gene_count = gene_counts.mean().astype(int)
half_of_max_count = int(max_gene_count / 2)

# Create a bar graph with customized colors and outline color
fig, ax = plt.subplots(figsize=(18, 9))  # Adjust the figure size as desired
gene_counts.plot(kind='bar', 
                 color=[sample_color_list.get(column) for column in gene_counts.index],
                 edgecolor=[sample_color_list.get(column) for column in gene_counts.index],
                 ax=ax)
ax.set_title('Number of Genes per Samples', fontsize=16)
ax.set_xlabel('Samples', fontsize=12)
ax.set_ylabel('Number of Genes', fontsize=12)
ax.tick_params(axis='y', labelsize=12)  # Set font size for y-axis labels
ax.tick_params(axis='x', labelsize=12)  # Set font size for x-axis labels

# Rotate x-axis labels if needed
plt.xticks(rotation=90)

# Marking any y-axis that has less than half of the maximum count
plt.axhline(y= half_of_max_count, color='black', linewidth=3)

# Set the background color and gridlines
ax.set_facecolor('white')
ax.grid(False)

# Define the Gene Count graph file path for graph storing
Gene_count_path = os.path.join(graphs_files, f"{directory}_Gene_Count.svg")
plt.savefig(Gene_count_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Gene_Count.svg saved to {Gene_count_path}")

# Print statistics
print("Maximum Gene Count:", max_gene_count)
print("Half of Maximum Gene Count:", half_of_max_count)
print("Minimum Gene Count:", min_gene_count)
print("Average Gene Count:", avg_gene_count)

plt.close()
# plt.show()
RQ023682_Gene_Count.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/RQ023682_Gene_Count.svg
Maximum Gene Count: 15064
Half of Maximum Gene Count: 7532
Minimum Gene Count: 1627
Average Gene Count: 13100

Read count normalization (EdgeR, Bioinfokit) ¶

InĀ [36]:
# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")

# Cleaning out dAAVS1 and pDest from the dataframe 
exclude_keywords = ["dAAVS1", "pDest"]
data_nor = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]

# Changing ORF_ID columns to number and sort the dataframe
data_nor.loc[:, "ORF_ID"] = pd.to_numeric(data_nor["ORF_ID"], errors="coerce")
data_nor = data_nor.sort_values("ORF_ID").reset_index(drop=True)

# Select data based on header values
columns_to_include = [column for column in data_nor.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
data_col = data_nor[columns_to_include]

# Add length for length normalization
gene_info = data_nor[["Gene_Symbol", "Length"]]
gene_merge = gene_info.merge(data_col, left_index=True, right_index=True)

# Set dataframes for gene expression normalziation 
data_edgeR = data_col
data_bioinfokit = gene_merge

# norm function associated with scipy.
from bioinfokit.analys import norm

# Define the normalization method you want to use: 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'
chosen_normalization = 'TMM'  # Change this to your desired normalization method\
    
if chosen_normalization == 'CPM':
    # Load your data and replace NaN with 0
    data_cpm = data_edgeR.fillna(0)

    # Convert pandas DataFrame to R data frame
    data_raw_r = df_to_r_dataframe(data_cpm)

    # Calculate CPM in R
    norm_raw = edgeR.cpm(data_raw_r)
    norm_log = edgeR.cpm(data_raw_r, log=True)

    # Access column names
    norm_colnames = list(data_raw_r.colnames)

elif chosen_normalization in ['GeTMM', 'TMM']:
    if chosen_normalization == 'GeTMM':
        # Load your data and replace NaN with 0 and drop Gene_Symbol
        data_norm = data_bioinfokit.fillna(0)
        data_norm = data_norm.drop(['Gene_Symbol'], axis=1)
        
        # Adding length for the length normalization 
        data_norm['Length'] = data_norm['Length'] / 10**3
        data_norm.iloc[:, 1:] = data_norm.iloc[:, 1:].div(data_norm['Length'], axis=0)
        
        # Select data based on header values
        columns_to_include = [col for col in data_bioinfokit.columns if any(any(sample in col for sample in samples) for samples in sample_key.values())]
        data_norm = data_norm[columns_to_include]
    
    elif chosen_normalization == 'TMM':
        data_norm = data_edgeR.fillna(0)
    
    # Set the GeTMM/TMM handle
    getmm_tmm_handle = "Control"  # Change this to your desired normalization factors
    
    # Perform normalization and get results
    dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = edgeR_normfactor(data_norm, getmm_tmm_handle)

elif chosen_normalization in ['RPKM', 'TPM']:
    if chosen_normalization == 'RPKM':
        nm_method = 'rpkm'
    elif chosen_normalization == 'TPM':
        nm_method = 'tpm'
    
    data_norm = data_bioinfokit.fillna(0)

    # Convert 'Length' column to numeric (if it's not already)
    data_norm['Length'] = pd.to_numeric(data_norm['Length'], errors='coerce')

    # Make 'Gene_Symbol' column as the index column
    data_norm.set_index('Gene_Symbol', inplace=True)
    
    nm = norm()
    getattr(nm, nm_method)(df=data_norm, gl='Length')

    # Get the normalized DataFrame
    nor_df = getattr(nm, f'{nm_method}_norm')

    # Reset index back to default integer index
    nor_df.reset_index(drop=True, inplace=True)
    nor_raw = nor_df

    # Calculate the logarithm of nor values (base 2, with a small constant added)
    avoid_nan = 0.18050946883  # Mimic edgeR cpm(log=True)
    nor_log = (np.log2(nor_raw + avoid_nan)).astype(float)

    # Convert nor_raw and nor_log DataFrame to array
    norm_raw = nor_raw.values
    norm_log = nor_log.values

    # Access column names
    norm_colnames = nor_raw.columns.tolist()

else:
    raise ValueError(f"Invalid normalization method: {chosen_normalization}. Choose 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'.")

# cpm_raw to dataframe for storage
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
nor_log_df = pd.DataFrame(data=np.where(norm_log != 0, norm_log, np.nan), columns=norm_colnames)

# Saving the merged DataFrame
nor_raw_path = os.path.join(database_files_original, f"{directory}_nor_raw.xlsx")
nor_raw_compile_df = save_dataframe(fasta_data, nor_raw_df, nor_raw_path)

nor_log_path = os.path.join(database_files_original, f"{directory}_nor_log.xlsx")
nor_log_compile_df = save_dataframe(fasta_data, nor_log_df, nor_log_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_raw.xlsx
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_log.xlsx

Noise detection ¶

InĀ [37]:
# Histogram of log2 to determine proper cutoff
median_log2_nor = base.apply(norm_log, 1, stats.median)
graphics.hist(median_log2_nor)
expr_cutoff = -1.0
graphics.abline(v=expr_cutoff, col="red", lwd=3)
expr_count = base.sum(FloatVector(np.array(median_log2_nor) > expr_cutoff))[0]

# Plot histogram using Python
plt.figure(figsize=(18, 9))  # Adjust the figure size as desired

# Calculate the bin width and adjust the bar width and spacing
num_bins = 50
data_range = np.ptp(median_log2_nor)
bin_width = data_range / num_bins
bar_width = 3.0 * bin_width
bar_spacing = bin_width - bar_width

# Plot the histogram with adjusted bar width and spacing
plt.hist(median_log2_nor, bins=num_bins, range=(np.min(median_log2_nor), np.max(median_log2_nor)),
         color='black', edgecolor='black', linewidth=0.5,
         rwidth=bar_width, align='mid')

# Set the background color and gridlines
plt.gca().set_facecolor('lightgray')
plt.grid(color='white', linestyle='-', linewidth=0.5)

plt.axvline(x=expr_cutoff, color='red', linewidth=3)
plt.title('Histogram of log2 nor', fontsize=16)
plt.xlabel('Log2 nor', fontsize=16)
plt.ylabel('No of Genes in log2', fontsize=16)
plt.yscale('log')
plt.tick_params(axis='both', labelsize=12) 

print("Total number of genes after the cutoff:", int(expr_count))

# Define the nor Histogram graph file path for graph storing
nor_histogram_path = os.path.join(graphs_files_original, f"{directory}_norm_histogram.svg")
plt.savefig(nor_histogram_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_nor_Histogram.svg saved to {nor_histogram_path}")

# plt.close()
plt.close()
Total number of genes after the cutoff: 14403
RQ023682_nor_Histogram.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_norm_histogram.svg

Noise removal from dataframe ¶

InĀ [38]:
# Convert median_log2_nor to a numpy array
median_log2_nor_np = np.array(median_log2_nor)

# Create a boolean mask based on the expression cutoff
mask = median_log2_nor_np > expr_cutoff
unmask = median_log2_nor_np <= expr_cutoff

# Get the indices where the mask is True
indices = np.where(mask)[0]
# Get the indices where the mask is not True
non_indices = np.where(unmask)[0]  

# Subset nor_raw based on the indices
nor_clean = norm_raw[indices, :]
# Subset nor_raw based on not in the indices
nor_unclean = norm_raw[non_indices, :]

# Access column names
if chosen_normalization in ['RPKM', 'TPM']:
    nor_colnames = nor_raw.columns.tolist()
elif chosen_normalization  in ['CPM', 'TMM', 'GeTMM']:
    nor_colnames = list(data_raw_r.colnames)

# nor_raw to dataframe for storage
nor_clean_df = pd.DataFrame(data=np.where(nor_clean != 0, nor_clean, np.nan), index=indices, columns=nor_colnames)
nor_unclean_df = pd.DataFrame(data=np.where(nor_unclean != 0, nor_unclean, np.nan), index=non_indices, columns=nor_colnames)

# Saving the merged DataFrame
nor_clean_path = os.path.join(database_files_original, f"{directory}_nor_clean.xlsx")
nor_clean_compile_df = save_dataframe(fasta_data, nor_clean_df, nor_clean_path)

# Saving the merged Dataframe
nor_unclean_path = os.path.join(database_files_original, f"{directory}_nor_unclean.xlsx")
nor_unclean_compile_df = save_dataframe(fasta_data, nor_unclean_df, nor_unclean_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_clean.xlsx
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_unclean.xlsx

Correlation matrix clustering (Pre-normalization) ¶

InĀ [39]:
# Get the column names from data_raw
col_names = nor_clean_df.columns.values.tolist()

# Calculate the correlation matrix in R
cor_matrix = stats.cor(nor_clean, use="everything", method ="pearson")

# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)

# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)

# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)

# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']

# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]

# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
                   dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
                   xticklabels=col_names, yticklabels=col_names)

# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)

# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering',  fontsize=16, pad=20, loc='center', y=1.15)

# Define the nor Histogram graph file path for graph storing
PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_PCC_Heatmap.svg")
plt.savefig(PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")

# plt.close()
plt.close()
RQ023682_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_PCC_Heatmap.svg

Removing unnecessary columns ¶

InĀ [40]:
# List of strings you want to check for in the column names
unnecessary_to_remove = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"]

# List comprehension to filter out columns containing any of the strings
column_removal = [col for col in nor_clean_compile_df.columns if any(string in col for string in unnecessary_to_remove)]

# Get the common elements between 'column_removal' and DataFrame columns using 'intersection'
columns_to_drop = set(column_removal).intersection(nor_clean_compile_df.columns)

# Drop the columns from the DataFrame
nor_clean_compile_df = nor_clean_compile_df.drop(columns=columns_to_drop)

# Saving the merged DataFrame
nor_clean_removed_path = os.path.join(database_files_original, f"{directory}_nor_clean_removed.xlsx")
nor_clean_compile_df.to_excel(nor_clean_removed_path, index=False)

Box & Violin Plot (Pre-normalization) ¶

InĀ [41]:
# Removing nan values
nor_clean_compile_df.replace(['nan'], np.nan, inplace=True)

# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include]
sample_data = sample_data.dropna(how='all').astype(float)

# Get the gene symbols corresponding to the data points
gene_symbols = nor_clean_compile_df.loc[sample_data.index, 'Gene_Symbol']

# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)

# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)

# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')

# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)

# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())

# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors)  # Or any other color palette you prefer

# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)

# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)

# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))

# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')

# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart', 
            #    xticklabels=[f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)], # Use quantiles as x-axis labels
               palette=sample_color_dict
               )
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])

# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_original, f"{directory}_Pre_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")

# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/199245737.py:73: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
RQ023682_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Pre_Box_Violin_Plot.svg

Quantile Normalization (Limma) ¶

InĀ [42]:
# Removing first 5 columns
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
nor_clean_quant = nor_clean_compile_df[columns_to_include].astype(float)

# Convert pandas DataFrame to R data frame
nor_clean_r = df_to_r_dataframe(nor_clean_quant)

# Perform quantile normalization
nor_clean_quant_r = robjects.r['normalizeQuantiles'](nor_clean_r , ties=True)

# Change R DataFrame to pandas DataFrame
nor_clean_quant_df = robjects.conversion.rpy2py(nor_clean_quant_r)

# Saving the merged DataFrame
Qunatile_Path = os.path.join(database_files_original, f"{directory}_quatile_norm.xlsx")
Quantile_df = save_dataframe(fasta_data, nor_clean_quant_df, Qunatile_Path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_quatile_norm.xlsx

Box & Violin Plot (Post-Quantile Normalization) ¶

InĀ [43]:
# Removing nan values
Quantile_df.replace(['nan'], np.nan, inplace=True)

# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = Quantile_df[columns_to_include]
sample_data = sample_data.dropna(how='all').astype(float)

# Get the gene symbols corresponding to the data points
gene_symbols = Quantile_df.loc[sample_data.index, 'Gene_Symbol']

# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)

# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)

# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')

# Calculate the 5th and 95th percentile of the expression level
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.05)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.95)

# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())

# Define the sample names
include_baselline = name_list.copy()
new_value = "Baseline"
include_baselline.append(new_value)
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in include_baselline)]

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors)  # Or any other color palette you prefer

# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)

# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)

# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))

# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')

# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart', 
            #    xticklabels=[f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)], # Use quantiles as x-axis labels
               palette=sample_color_dict 
               )
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])

# Define the nor Histogram graph file path for graph storing
Pro_Box_Violin_Plot_path = os.path.join(graphs_files_original, f"{directory}_Pro_Box_Violin_Plot.svg")
plt.savefig(Pro_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Box_Violin_Plot.html saved to {Pro_Box_Violin_Plot_path}")

# Show the plot
plt.tight_layout()
plt.close()
# plt.show()  
/tmp/ipykernel_1548459/3213361500.py:73: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
RQ023682_Box_Violin_Plot.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Pro_Box_Violin_Plot.svg

Batch correction (Limma) ¶

InĀ [44]:
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data = Quantile_df[columns_to_include]
batch_data = batch_data.dropna(how='all').astype(float)

# Extract the column names from the pandas DataFrame
col_headers = batch_data.columns.tolist()

# Convert pandas DataFrame to R data frame
nor_clean_quant_batch_r = df_to_r_dataframe(batch_data)

# Initialize variables
batch_assignment = []
current_batch = 0

# Function to get the batch name based on the current batch number
def get_batch_name(batch_number):
    if batch_number == 0:
        return "BatchBaseline"
    else:
        return "Batch" + str(batch_number)

# Assign batches based on the header pattern
for name in col_headers:
    if "Baseline" in name:
        current_batch += 1
    batch_assignment.append(get_batch_name(current_batch))

# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)

# Check if batch is not empty
if current_batch > 1:
    # Perform batch correction
    batch_corrected_data = limma.removeBatchEffect(nor_clean_quant_batch_r, batch=batch, refbatch=1)
else:
    # Skip batch correction
    batch_corrected_data = Quantile_df

# Change R DataFrame to pandas DataFrame
batch_df = pd.DataFrame(batch_corrected_data, columns=col_headers)
batch_df = batch_df.set_index(batch_data.index)

# Saving the merged DataFrame
batch_compile_path = os.path.join(database_files_original, f"{directory}_batch_corrected.xlsx")
batch_compile_df = save_dataframe(fasta_data, batch_df, batch_compile_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_batch_corrected.xlsx
InĀ [45]:
Quantile_df
Out[45]:
ORF_ID NCBI Group Gene_Symbol GC_Content 1-Baseline-batch1 2-DMSO-A1 3-DMSO-B1 4-DMSO-C1 5-Paclitaxel-A ... 60-Vinblastine-A 61-Vinblastine-B 62-Vinblastine-C 68-Baseline-batch5 69-DMSO-A 70-DMSO-B 71-DMSO-C 72-TAS102-A 73-TAS102-B 74-TAS102-C
0.0 1.0 805.0 G06 CALM2 39.111111 300.277868 210.868951 202.486238 197.923074 170.924696 ... 232.544281 112.761188 217.350953 192.534360 96.833947 116.643235 167.641227 132.742378 115.507831 79.060483
1.0 2.0 2629.0 G02 GBA 55.245189 23.713128 23.491208 19.233388 18.550245 38.450861 ... 45.954597 NaN 12.364550 20.552707 22.234654 24.369918 20.680702 64.281363 33.614174 57.161873
2.0 3.0 10282.0 G03 BET1 38.375350 179.314316 197.271195 129.053012 204.098197 201.406659 ... 182.026519 251.641328 172.638726 118.573611 118.346581 100.196028 59.911706 139.666778 126.810903 79.453204
4.0 6.0 7178.0 G02 TPT1 44.123314 102.142005 221.469150 154.558439 107.023297 85.745020 ... 80.962702 70.946782 110.521912 73.799837 67.763080 90.346526 59.579791 81.550214 45.024986 42.189666
5.0 7.0 8089.0 G01 YEATS4 36.111111 138.201549 177.538561 154.558439 141.025297 219.543618 ... 152.368022 99.246231 65.690778 163.842039 125.789194 132.942906 129.226388 139.952415 70.684885 103.838296
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18383.0 100080862.0 653427.0 delta FOXD4L5 61.280000 20.325957 41.517837 27.221601 31.855617 35.936318 ... 10.649355 NaN 5.011689 14.701741 20.622145 32.329477 44.475515 22.947815 11.018509 13.262017
18384.0 100080864.0 389058.0 delta SP5 57.239627 35.858287 32.649168 31.565979 52.378266 64.008134 ... 32.490999 NaN 26.721827 33.971673 34.418664 31.355339 28.792104 43.922590 60.544982 39.774269
18385.0 100080865.0 642623.0 delta UBTFL1 63.686636 39.788269 14.195399 2.667069 21.536820 9.469249 ... 43.190103 NaN 22.666489 85.620338 67.466986 33.336147 63.747687 9.206605 24.672902 33.623282
18386.0 100080869.0 100131980.0 delta ZNF705G 65.692308 24.680019 17.158396 37.622366 22.549472 16.422234 ... 38.541220 NaN 49.277263 32.431882 23.757016 25.840072 8.715556 37.857687 29.494166 30.927423
18387.0 100080871.0 7617.0 delta ZNF66 62.395076 1.206136 NaN NaN 0.726026 1.297132 ... NaN NaN NaN 5.406051 3.135368 2.494526 2.390596 1.583552 6.637757 1.868317

14403 rows Ɨ 73 columns

InĀ [46]:
batch_compile_df
Out[46]:
ORF_ID NCBI Group Gene_Symbol GC_Content 1-Baseline-batch1 2-DMSO-A1 3-DMSO-B1 4-DMSO-C1 5-Paclitaxel-A ... 60-Vinblastine-A 61-Vinblastine-B 62-Vinblastine-C 68-Baseline-batch5 69-DMSO-A 70-DMSO-B 71-DMSO-C 72-TAS102-A 73-TAS102-B 74-TAS102-C
0.0 1.0 805.0 G06 CALM2 39.111111 272.637273 183.228356 174.845643 170.282479 143.284100 ... 227.906496 108.123403 212.713168 240.905687 145.205273 165.014561 216.012554 181.113705 163.879157 127.431810
1.0 2.0 2629.0 G02 GBA 55.245189 28.122769 27.900849 23.643029 22.959886 42.860502 ... 36.421126 NaN 2.831079 18.588603 20.270550 22.405813 18.716598 62.317259 31.650070 55.197769
2.0 3.0 10282.0 G03 BET1 38.375350 190.436936 208.393815 140.175633 215.220817 212.529279 ... 144.025149 213.639959 134.637357 155.006645 154.779615 136.629062 96.344740 176.099812 163.243937 115.886238
4.0 6.0 7178.0 G02 TPT1 44.123314 87.667599 206.994744 140.084033 92.548891 71.270614 ... 82.369964 72.354044 111.929175 98.697817 92.661060 115.244506 84.477771 106.448194 69.922966 67.087645
5.0 7.0 8089.0 G01 YEATS4 36.111111 133.679273 173.016285 150.036163 136.503021 215.021342 ... 142.047378 88.925588 55.370135 190.302099 152.249255 159.402966 155.686448 166.412475 97.144945 130.298357
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18383.0 100080862.0 653427.0 delta FOXD4L5 61.280000 20.171238 41.363118 27.066882 31.700898 35.781599 ... 20.938496 NaN 15.300830 17.683353 23.603757 35.311088 47.457126 25.929427 14.000121 16.243628
18384.0 100080864.0 389058.0 delta SP5 57.239627 27.438108 24.228989 23.145799 43.958086 55.587954 ... 42.244861 NaN 36.475690 32.309260 32.756251 29.692926 27.129690 42.260177 58.882569 38.111856
18385.0 100080865.0 642623.0 delta UBTFL1 63.686636 38.413825 12.820956 1.292625 20.162376 8.094805 ... 43.746414 NaN 23.222800 75.820223 57.666871 23.536032 53.947572 -0.593510 14.872787 23.823167
18386.0 100080869.0 100131980.0 delta ZNF705G 65.692308 12.210429 4.688806 25.152777 10.079882 3.952644 ... 35.649124 NaN 46.385168 38.381735 29.706868 31.789924 14.665408 43.807539 35.444019 36.877275
18387.0 100080871.0 7617.0 delta ZNF66 62.395076 2.765502 NaN NaN 2.285391 2.856497 ... NaN NaN NaN 5.040778 2.770094 2.129252 2.025323 1.218279 6.272484 1.503043

14403 rows Ɨ 73 columns

Correlation matrix clustering (Post-normalization) ¶

InĀ [47]:
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")

# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)

# Convert pandas DataFrame to R data frame
with (robjects.default_converter + pandas2ri.converter).context():
    batch_compile_df_corr_r = robjects.conversion.get_conversion().py2rpy(batch_compile_df_corr)

# Get the column names from data_raw
col_names = batch_compile_df_corr.columns.values.tolist()

# Calculate the correlation matrix in R
cor_matrix = stats.cor(batch_compile_df_corr_r, use="everything", method ="spearman")

# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)

# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)

# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)

# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']

# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]

# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
                   dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
                   xticklabels=col_names, yticklabels=col_names)

# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)

# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering',  fontsize=16, pad=20, loc='center', y=1.15)

# Define the CPM Histogram graph file path for graph storing
Post_PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_Post_PCC_Heatmap.svg")
plt.savefig(Post_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_Post_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")

plt.close()
# plt.show()
RQ023682_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_PCC_Heatmap.svg

Interactive 3D PCA ¶

InĀ [536]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

# Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

# Customise marker size, line width, edge colour and grid colour 
marker_size = 5
marker_linewidth = 0.2
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

#Perform PCA with 3 components 
pca = PCA(n_components=3)
pca_3d = pca.fit_transform(X)

#Assign data to dataframe with index (sample names)
pca_3d = pd.DataFrame(pca_3d, index=exp_table_norm_T.index)

#Assign the colounms with a color per samples
pca_3d.columns = ['PC%s (%s' % (i + 1, round(pca.explained_variance_ratio_[i] * 100, 2)) + '%)' for i in range(3)]
pca_3d['Colour'] = [sample_color_dict[sample] for sample in pca_3d.index]

#Finding the total variance if necessary
total_var = pca.explained_variance_ratio_.sum() * 100

# Create a figure object
fig = go.Figure()

# Iterate through each sample and add scatter plot trace to the figure
for sample in pca_3d.index:
    pca_3d_trace = go.Scatter3d(
        x=[pca_3d.loc[sample, pca_3d.columns[0]]],  # Use the first component as 'x' values
        y=[pca_3d.loc[sample, pca_3d.columns[1]]],  # Use the second component as 'y' values
        z=[pca_3d.loc[sample, pca_3d.columns[2]]],  # Use the third component as 'z' values
        mode='markers',
        marker=dict(
            size=5,  # Adjust marker size
            line=dict(width=marker_linewidth, color=marker_edgecolor),
            color=pca_3d.loc[sample, 'Colour'],  # Use the 'Colour' column for coloring
        ),
        name=sample  # Use the sample name as the legend label
    )
    fig.add_trace(pca_3d_trace)

# Update the layout of the 3D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    scene=dict(
        xaxis=dict(
            title=pca_3d.columns[0],  # Set x-axis title using the first component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230,230)'
        ),
        yaxis=dict(
            title=pca_3d.columns[1],  # Set y-axis title using the second component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230, 230)'
        ),
        zaxis=dict(
            title=pca_3d.columns[2],  # Set z-axis title using the third component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230,230)'
        ),
        aspectratio=dict(x=0.9, y=0.9, z=0.9),
        aspectmode='manual'
    ),
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size,  # Adjust marker size
                              line=dict(width=marker_linewidth, color=marker_edgecolor)),
                  selector=dict(mode='markers'))

fig.layout.font.family = 'Arial'
fig.layout.font.size = 15
fig.layout.font.color = 'black'

# Define the nor Histogram graph file path for graph storing
PCA_3D_Plot_path = os.path.join(graphs_files_original, f"{directory}_3D_PCA.html")
fig.write_html(PCA_3D_Plot_path)
print(f"{directory}_3D_PCA.html saved to {PCA_3D_Plot_path}")

# Show the figure inline in the notebook
# fig.show()

# Close the figure
plt.close()
RQ023682_3D_PCA.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_3D_PCA.html
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/_plotly_utils/basevalidators.py:2596: DeprecationWarning:

*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

Interactive 2D PCA ¶

InĀ [49]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

#Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 10
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform PCA with 2 components
pca = PCA(n_components=2, 
            random_state= 10000)

# Transformed data to lower dimension
pca_2d = pca.fit_transform(X)

# Assign data to dataframe with index (sample names)
pca_2d = pd.DataFrame(pca_2d, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
pca_2d.columns = ['PCA%s (%s' % (i + 1, round(pca.explained_variance_ratio_[i] * 100, 2)) + '%)' for i in range(2)]
pca_2d['Colour'] = [sample_color_dict[sample] for sample in pca_2d.index]
color_discrete_map = {sample: pca_2d.loc[sample, 'Colour'] for sample in pca_2d.index}

fig = px.scatter(pca_2d, 
                 x=pca_2d.columns[0], 
                 y=pca_2d.columns[1],
                 color=pca_2d.index,  # Use tsne_df.index for coloring
                 color_discrete_map=color_discrete_map,  # Map index values to colors
                 hover_name=pca_2d.index)

fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    xaxis=dict(
        showgrid=True,
        gridcolor=color_grid,
        zerolinecolor=color_grid,
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor=color_grid,
        zerolinecolor=color_grid,
    ),
    legend=dict(
        title=dict(text='Samples')  # Change the legend title to "Sample"
    )
)

fig.update_traces(marker=dict(size=marker_size, 
                              line=dict(width=marker_linewidth,
                                        color=marker_edgecolor)),
                  selector=dict(mode='markers'))

fig.update_layout(font=dict(family='Arial', size=15, color='black'))

# Define the graph path for storing
PCA_2D_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_PCA.html")
fig.write_html(PCA_2D_Plot_path)
print(f"{directory}_2D_PCA.html saved to {PCA_2D_Plot_path}")

# Show the figure
# fig.show()
plt.close()
RQ023682_2D_PCA.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_PCA.html

Interactive 2D T-SEN ¶

InĀ [50]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

# Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

# Customise marker size, line width, edge colour, and grid colour
marker_size = 10
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform t-SNE with 2 components
tsne_data = TSNE(n_components=2, random_state=10000, perplexity=len(col_sample_Set)-1)

tsne = tsne_data.fit_transform(X)

# Create a DataFrame with t-SNE results and sample colors
tsne_df = pd.DataFrame(tsne, index=exp_table_norm_T.index, columns=['TSNE1', 'TSNE2'])
tsne_df['Colour'] = [sample_color_dict[sample] for sample in tsne_df.index]

color_discrete_map = {sample: tsne_df.loc[sample, 'Colour'] for sample in tsne_df.index}

# Create the scatter plot using px.scatter
fig = px.scatter(tsne_df, 
                 x=tsne_df.columns[0], 
                 y=tsne_df.columns[1], 
                 color=tsne_df.index,  # Use tsne_df.index for coloring
                 color_discrete_map=color_discrete_map,  # Map index values to colors
                 hover_name=tsne_df.index)

# Update the layout of the 2D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    xaxis_title=tsne_df.columns[0],
    yaxis_title=tsne_df.columns[1],
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size, 
                              line=dict(width=marker_linewidth,
                                        color=marker_edgecolor)),
                  selector=dict(mode='markers'))

# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))

# Define the graph path for storing
TSEN_2D_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_TSEN.html")
fig.write_html(TSEN_2D_Plot_path)
print(f"{directory}_2D_TSEN.html saved to {TSEN_2D_Plot_path}")

# Show the figure
# fig.show()
plt.close()
RQ023682_2D_TSEN.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_TSEN.html

Interactive 2D UMAP ¶

InĀ [51]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

# Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=2, 
                      random_state=77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(2)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}

# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]

# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df, 
                 x=umap_2d_df.columns[0], 
                 y=umap_2d_df.columns[1], 
                 color=umap_2d_df.index,  # Use umap_2d_df.index for coloring
                 color_discrete_map=color_discrete_map,  # Map index values to colors
                 hover_name=umap_2d_df.index)

# Update the layout of the 2D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    xaxis_title=umap_2d_df.columns[0],
    yaxis_title=umap_2d_df.columns[1],
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size, 
                              line=dict(width=marker_linewidth,
                                        color=marker_edgecolor)),
                  selector=dict(mode='markers'))

# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))

# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")

# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html
InĀ [52]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

# Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3, 
                      random_state=77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}

# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]

# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df, 
                 x=umap_2d_df.columns[0], 
                 y=umap_2d_df.columns[1], 
                 color=umap_2d_df.index,  # Use umap_2d_df.index for coloring
                 color_discrete_map=color_discrete_map,  # Map index values to colors
                 hover_name=umap_2d_df.index)

# Update the layout of the 2D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    xaxis_title=umap_2d_df.columns[0],
    yaxis_title=umap_2d_df.columns[1],
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size, 
                              line=dict(width=marker_linewidth,
                                        color=marker_edgecolor)),
                  selector=dict(mode='markers'))

# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))

# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")

# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html
InĀ [53]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

# Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3, 
                      random_state=77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}

# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]

# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df, 
                 x=umap_2d_df.columns[2], 
                 y=umap_2d_df.columns[1], 
                 color=umap_2d_df.index,  # Use umap_2d_df.index for coloring
                 color_discrete_map=color_discrete_map,  # Map index values to colors
                 hover_name=umap_2d_df.index)

# Update the layout of the 2D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    xaxis_title=umap_2d_df.columns[2],
    yaxis_title=umap_2d_df.columns[1],
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size, 
                              line=dict(width=marker_linewidth,
                                        color=marker_edgecolor)),
                  selector=dict(mode='markers'))

# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))

# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")

# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html

2D UMAP (Two legends) ¶

InĀ [54]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

#Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 20
marker_linewidth = 5

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=2, 
                      random_state=77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(2)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]

# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}

# Assign the columns with color per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]

# Create a list of drugs to exclude
control_to_exclude = ['DMSO', 'Baseline', 'mCherryPositive&BFPNegative', "mCherryNegative&BFPNegative", 'Serumfree']
umap_2d_df = umap_2d_df[~umap_2d_df.index.str.contains('|'.join(control_to_exclude))]

# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_2d_df = umap_2d_df.loc[~umap_2d_df.index.str.strip().isin(drug_to_exclude)]

# Create the scatter plot using plt.scatter
plt.figure(figsize=(20, 16))
scatterplot = plt.scatter(umap_2d_df['UMAP1'], 
                          umap_2d_df['UMAP2'], 
                          c=umap_2d_df['Colour'], 
                          edgecolors=umap_2d_df['DrugCategory_color'], 
                          s=200,
                          linewidth=3.0
                          )

# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_2d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs))  # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
    drug_color = umap_2d_df.loc[umap_2d_df.index.str.contains(drug), 'Colour'].iloc[0]
    legend_label = drug
    common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))

# Custom legend for drug categories with their colors
drug_categories = umap_2d_df['DrugCategory'].unique()
drug_categories_color = umap_2d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
    drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))

# Customize the graph 
ax = plt.gca()
ax.set(facecolor = "white")
ax.grid(color='#F5F5F5')  # Set gridline color to black
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('UMAP1', fontsize=14)  # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=14)  # Change font size for y-axis label

# Show the custom legends
common_legend = ax.legend(handles=common_drug_legend_handles, 
                          bbox_to_anchor=(1.05, 1), 
                          loc='upper left', 
                          ncol=1, 
                          prop ={'size': 15}, 
                          title="Sample Color",
                          title_fontsize= 16,
                          facecolor = 'white',
                          borderaxespad=0.)
drug_category_legend = ax.legend(handles=drug_category_legend_handles, 
                                 bbox_to_anchor=(1.05, 0.5), 
                                 loc='upper left', 
                                 ncol=1, 
                                 prop ={'size': 15}, 
                                 title="Category Color", 
                                 title_fontsize= 16,
                                 facecolor = 'white',
                                 borderaxespad=0.)
ax.add_artist(common_legend)

# Saving the figure
UMAP_2D_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.svg")
plt.savefig(UMAP_2D_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP.svg saved to {UMAP_2D_path}")

# Show the figure
# plt.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_2D_UMAP.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.svg

2D UMAP (Single legend) ¶

InĀ [55]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

#Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 50
marker_linewidth = 5

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3, 
                      random_state = 77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]

# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}

# Assign the columns with color per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]

# Create a list of drugs to exclude
control_to_exclude = [
                        # 'DMSO',
                        'Baseline', 
                        'mCherryPositive&BFPNegative',
                        "mCherryNegative&BFPNegative", 
                        'Serumfree'
                        ]
umap_2d_df = umap_2d_df[~umap_2d_df.index.str.contains('|'.join(control_to_exclude))]

# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_2d_df = umap_2d_df.loc[~umap_2d_df.index.str.strip().isin(drug_to_exclude)]

# Create the scatter plot using plt.scatter
plt.figure(figsize=(10, 10))
scatterplot = plt.scatter(umap_2d_df['UMAP1'], 
                          umap_2d_df['UMAP2'], 
                          c=umap_2d_df['Colour'], 
                          edgecolors=umap_2d_df['DrugCategory_color'], 
                          s=200,
                          linewidth=3.0
                          )

# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_2d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs))  # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
    drug_color = umap_2d_df.loc[umap_2d_df.index.str.contains(drug), 'Colour'].iloc[0]
    legend_label = drug
    common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))

# Custom legend for drug categories with their colors
drug_categories = umap_2d_df['DrugCategory'].unique()
drug_categories_color = umap_2d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
    drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))

# Calculate the range of UMAP1 and UMAP2
umap1_range = umap_2d_df['UMAP1'].max() - umap_2d_df['UMAP1'].min()
umap2_range = umap_2d_df['UMAP2'].max() - umap_2d_df['UMAP2'].min()
max_range = max(umap1_range, umap2_range)

# Customize the graph 
ax = plt.gca()

# Set the axis limits with the same range for both axes
ax.set_xlim(umap_2d_df['UMAP1'].mean() - max_range / 2, umap_2d_df['UMAP1'].mean() + max_range / 2)
ax.set_ylim(umap_2d_df['UMAP2'].mean() - max_range / 2, umap_2d_df['UMAP2'].mean() + max_range / 2)

ax.grid(False)
ax.set(facecolor = "white")
ax.tick_params(axis='both', labelsize=16)
ax.set_xlabel('UMAP1', fontsize=16)  # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=16)  # Change font size for y-axis label

# Show the axis lines
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.spines['top'].set_visible(True)

# Set the color of the spines to black
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['top'].set_color('black')

# Set the maximum number of ticks to display on the y-axis as integers (no decimals)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))

# Set the aspect ratio to be equal, making the axis square
plt.axis('equal')
ax.set_aspect('auto')

title_font = {'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 18}
plt.title("UMAP for 16 Chemotherapeutics and Control", fontdict=title_font)

drug_category_legend = ax.legend(handles=drug_category_legend_handles, 
                                 bbox_to_anchor=(0.02, 0.99), 
                                 loc='upper left', 
                                 ncol=1, 
                                 prop ={'size': 10}, 
                                 title="Category Color", 
                                 title_fontsize= 16,
                                 facecolor = 'white',
                                 borderaxespad=0.)

# Saving the figure
UMAP_2D_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.svg")
plt.savefig(UMAP_2D_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP.svg saved to {UMAP_2D_path}")

# Show the figure
# plt.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_2D_UMAP.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.svg

2D UMAP (UMAP 2 & 3)¶

InĀ [56]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

#Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

#Customise marker size, line width, edge colour and grid colour 
marker_size = 50
marker_linewidth = 5

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3, 
                      random_state = 77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_3d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_3d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_3d_df['Colour'] = [sample_color_dict[sample] for sample in umap_3d_df.index]

# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_3d_df.loc[sample, 'Colour'] for sample in umap_3d_df.index}

# Assign the columns with color per category
umap_3d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_3d_df.index]
umap_3d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_3d_df['DrugCategory']]

# Create a list of drugs to exclude
control_to_exclude = [
                        # 'DMSO',
                        'Baseline', 
                        'mCherryPositive&BFPNegative',
                        "mCherryNegative&BFPNegative", 
                        'Serumfree'
                        ]
umap_3d_df = umap_3d_df[~umap_3d_df.index.str.contains('|'.join(control_to_exclude))]

# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_3d_df = umap_3d_df.loc[~umap_3d_df.index.str.strip().isin(drug_to_exclude)]

# Create the scatter plot using plt.scatter, but only showing UMAP2 and UMAP3
plt.figure(figsize=(10, 10))
scatterplot = plt.scatter(umap_3d_df['UMAP3'], 
                          umap_3d_df['UMAP2'], 
                          c=umap_3d_df['Colour'], 
                          edgecolors=umap_3d_df['DrugCategory_color'], 
                          s=200,
                          linewidth=3.0
                          )

# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_3d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs))  # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
    drug_color = umap_3d_df.loc[umap_3d_df.index.str.contains(drug), 'Colour'].iloc[0]
    legend_label = drug
    common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))

# Custom legend for drug categories with their colors
drug_categories = umap_3d_df['DrugCategory'].unique()
drug_categories_color = umap_3d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
    drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))

# Calculate the range of UMAP1 and UMAP2
umap2_range = umap_3d_df['UMAP3'].max() - umap_3d_df['UMAP3'].min()
umap3_range = umap_3d_df['UMAP2'].max() - umap_3d_df['UMAP2'].min()
max_range = max(umap2_range, umap3_range)

# Customize the graph 
ax = plt.gca()

# Set the axis limits with the same range for both axes
ax.set_xlim(umap_3d_df['UMAP3'].mean() - max_range / 2, umap_3d_df['UMAP3'].mean() + max_range / 2)
ax.set_ylim(umap_3d_df['UMAP2'].mean() - max_range / 2, umap_3d_df['UMAP2'].mean() + max_range / 2)

ax.grid(False)
ax.set(facecolor = "white")
ax.tick_params(axis='both', labelsize=16)
ax.set_xlabel('UMAP3', fontsize=16)  # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=16)  # Change font size for y-axis label

# Show the axis lines
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.spines['top'].set_visible(True)

# Set the color of the spines to black
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['top'].set_color('black')

# Set the maximum number of ticks to display on the y-axis as integers (no decimals)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))

# Set the aspect ratio to be equal, making the axis square
plt.axis('equal')
ax.set_aspect('auto')

title_font = {'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 18}
plt.title("UMAP for 16 Chemotherapeutics and Control", fontdict=title_font)

drug_category_legend = ax.legend(handles=drug_category_legend_handles, 
                                 bbox_to_anchor=(0.02, 0.99), 
                                 loc='upper left', 
                                 ncol=1, 
                                 prop ={'size': 10}, 
                                 title="Category Color", 
                                 title_fontsize= 16,
                                 facecolor = 'white',
                                 borderaxespad=0.)

UMAP_2D_2_3_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP_2_3.svg")
plt.savefig(UMAP_2D_2_3_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP_2_3.svg saved to {UMAP_2D_2_3_path}")

plt.close()
RQ023682_2D_UMAP_2_3.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP_2_3.svg
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

Interactive 3D UMAP¶

InĀ [57]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]

#Transpose dataframe 
exp_table_norm_T = batch_df[col_sample_Set].T

# Customise marker size, line width, edge colour and grid colour 
marker_size = 10
marker_linewidth = 0.2
marker_edgecolor = 'black'
color_grid = 'lightgray'

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]

# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)

# Assign values to a new variable and replace NaN with zeros 
X = exp_table_norm_T.values
X = np.nan_to_num(X)

# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3, 
                      random_state = 77,
                      n_epochs = 500
                      )
umap_fits = umap_data.fit_transform(X)

# Assign data to dataframe with index (sample names)
umap_3d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)

# Assign the columns with a color per sample
umap_3d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_3d_df['Colour'] = [sample_color_dict[sample] for sample in umap_3d_df.index]

# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_3d_df.loc[sample, 'Colour'] for sample in umap_3d_df.index}

# Assign the columns with color per category
umap_3d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_3d_df.index]
umap_3d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_3d_df['DrugCategory']]

# Create a list of drugs to exclude
control_to_exclude = [
                        # 'DMSO',
                        'Baseline', 
                        'mCherryPositive&BFPNegative',
                        "mCherryNegative&BFPNegative", 
                        'Serumfree'
                        ]
umap_3d_df = umap_3d_df[~umap_3d_df.index.str.contains('|'.join(control_to_exclude))]

# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_3d_df = umap_3d_df.loc[~umap_3d_df.index.str.strip().isin(drug_to_exclude)]

# Create a figure object
fig = go.Figure()

# Iterate through each sample and add scatter plot trace to the figure
for sample in umap_3d_df.index:
    umap_3d_trace = go.Scatter3d(
        x=[umap_3d_df.loc[sample, umap_3d_df.columns[0]]],  # Use the first component as 'x' values
        y=[umap_3d_df.loc[sample, umap_3d_df.columns[1]]],  # Use the second component as 'y' values
        z=[umap_3d_df.loc[sample, umap_3d_df.columns[2]]],  # Use the third component as 'z' values
        mode='markers',
        marker=dict(
            size=5,  # Adjust marker size
            line=dict(width=marker_linewidth, color=marker_edgecolor),
            color=pca_3d.loc[sample, 'Colour'],  # Use the 'Colour' column for coloring
        ),
        name=sample  # Use the sample name as the legend label
    )
    fig.add_trace(umap_3d_trace)

# Update the layout of the 3D scatter plot
fig.update_layout(
    width=1000,
    height=750,
    autosize=False,
    margin=dict(l=70, r=20, b=100, t=50, pad=0),
    template='plotly_white',
    hovermode='closest',
    scene=dict(
        xaxis=dict(
            title=umap_3d_df.columns[0],  # Set x-axis title using the first component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230,230)'
        ),
        yaxis=dict(
            title=umap_3d_df.columns[1],  # Set y-axis title using the second component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230, 230)'
        ),
        zaxis=dict(
            title=umap_3d_df.columns[2],  # Set z-axis title using the third component column name
            gridcolor=color_grid,
            zerolinecolor=color_grid,
            showbackground=False,
            backgroundcolor='rgb(230, 230,230)'
        ),
        aspectratio=dict(x=0.9, y=0.9, z=0.9),
        aspectmode='manual'
    ),
    legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)

fig.update_traces(marker=dict(size=marker_size,  # Adjust marker size
                              line=dict(width=marker_linewidth, color=marker_edgecolor)),
                  selector=dict(mode='markers'))

fig.layout.font.family = 'Arial'
fig.layout.font.size = 15
fig.layout.font.color = 'black'

# Define the nor Histogram graph file path for graph storing
UMAP_3D_Plot_path = os.path.join(graphs_files_original, f"{directory}_3D_UMAP.html")
fig.write_html(UMAP_3D_Plot_path)
print(f"{directory}_3D_UMAP.html saved to {UMAP_3D_Plot_path}")

# Showing the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning:

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

RQ023682_3D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_3D_UMAP.html

HDBSCAN ¶

InĀ [58]:
# Importing UMAP
from umap import UMAP

# Make copy of upmap_2d_df
HDBSCAN_df = umap_2d_df.copy()

# Extract the 'DrugCategory' column for clustering
drug_categories = HDBSCAN_df[['DrugCategory']]

# Initialize and fit the KModes clusterer
n_clusters = 4  # You can adjust the number of clusters as needed
km = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(drug_categories)

# Add the cluster labels to your DataFrame
HDBSCAN_df['Cluster_DrugCategory'] = clusters
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 5.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 15.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 5.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 12.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 5.0
Best run was number 1
InĀ [59]:
# Define legend labels for each cluster
legend_labels = [f'Cluster {i}' for i in range(len(HDBSCAN_df['Cluster_DrugCategory'].unique()))]

# Create a scatter plot
plt.figure(figsize=(10, 6))
scatter = plt.scatter(HDBSCAN_df['UMAP1'], HDBSCAN_df['UMAP2'], c=HDBSCAN_df['Cluster_DrugCategory'], cmap='viridis')

plt.title('UMAP Projection of Clusters by Drug Category')

# Create a distance matrix
umap_result = HDBSCAN_df[['UMAP1', 'UMAP2']].values
distance_matrix = distance_matrix(umap_result, umap_result)

# Build the minimum spanning tree
mst = minimum_spanning_tree(coo_matrix(distance_matrix))

# Plot the spanning tree
edges = mst.toarray()
edge_coordinates = np.argwhere(edges)

# Plot the spanning tree
plt.plot(umap_result[edge_coordinates[:, 0], 0], umap_result[edge_coordinates[:, 0], 1], 'k-', alpha=0.2)
plt.legend(handles=scatter.legend_elements()[0], labels=legend_labels, title="Clusters")

plt.show()
No description has been provided for this image
InĀ [60]:
# Create a distance matrix
umap_result = HDBSCAN_df[['UMAP1', 'UMAP2']]

# Build the minimum spanning tree using HDBSCAN
clusterer = hdbscan.HDBSCAN(algorithm='best', min_cluster_size=2, gen_min_span_tree=True,
                            approx_min_span_tree=True, leaf_size=40, 
                            metric='euclidean', min_samples=None, p=None)
clusterer.fit(umap_result)

# # Plot the minimum spanning tree
# clusterer.minimum_spanning_tree_.plot(
#     edge_cmap='viridis',
#     edge_alpha=0.6,
#     node_size=10,
#     edge_linewidth=1
# )
clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
# clusterer.condensed_tree_.plot()

plt.title('Minimum Spanning Tree in UMAP Space')
plt.show()
No description has been provided for this image
InĀ [61]:
# Group the data by 'DrugCategory' and calculate the median points for UMAP1 and UMAP2
category_medians = HDBSCAN_df.groupby('DrugCategory')[['UMAP1', 'UMAP2']].median()

# Perform hierarchical clustering based on the category medians using complete linkage
Z = linkage(category_medians, method='complete')

# Plot the dendrogram with custom labels and labeled Y-axis
plt.figure(figsize=(12, 6))
dendrogram(Z, orientation='top', labels=category_medians.index)
plt.title('Hierarchical Clustering Dendrogram of Drug Categories based on Medians')
plt.xticks(rotation=90)

# Label the Y-axis
plt.ylabel('Linkage Distance')

plt.show()
No description has been provided for this image

Scatter Plot ¶

InĀ [62]:
# Assign handle - only when its necessary
scatter_plot_handle = ""

if scatter_plot_handle == "Yes":
    # Assign here because it overlaps with R function of stats
    from scipy import stats

    # Define variables for marker size, color, transparency, and font sizes
    marker_size = 0.3
    marker_color = 'royalblue'
    marker_alpha = 0.6
    corr_fontsize = 8
    axis_fontsize = 10

    # Modify the default parameters of matplotlib for customization
    plt.rcParams["axes.labelsize"] = axis_fontsize
    plt.rcParams["axes.facecolor"] = 'white'
    plt.rcParams['figure.facecolor'] = 'white'
    plt.rcParams["svg.fonttype"] = 'none'

    # Define a function to create a scatterplot matrix
    def scatter_matrix_lower(df):
        # Calculate Pearson correlation coefficient between two variables
        def corrfunc(x, y, **kwargs):
            # using the 'pearsonr' function from 'stats' module in scipy
            r, _ = stats.pearsonr(x, y)
            # Add an annotation with the correlation coefficient value to the plot
            ax = plt.gca()
            ax.annotate("$p$ = {:.2f}".format(r),
                        xy=(.3, .9), xycoords=ax.transAxes, fontsize=corr_fontsize)
            #Enable gridlines
            plt.grid(True)
        
        # Create a PairGrid object from seaborn to visualize pairwise relationships
        grid = sns.PairGrid(data=df, vars=list(df), height=2)  # Increase the height parameter as per your preference
        # Plot scatterplots in the lower triangle of the PairGri
        grid.map_lower(plt.scatter, s=marker_size, color=marker_color, alpha=marker_alpha)
        # Add correlation coefficient annotations to the scatterplots
        grid.map_lower(corrfunc)

        # Modify properties of the PairGrid, such as transparency and tick marks
        grid.set(alpha=1)
        grid.set(xticks=[])
        grid.set(yticks=[])

        # Set an empty title for the figure
        grid.fig.suptitle('')

    # Call the scatter_matrix_lower function with a subset of the DataFrame
    scatter_matrix_lower(batch_df[col_sample_Set].dropna(how='any', axis=0))

    # Adjust the plot layout and display the scatterplot matrix
    plt.tight_layout()

    # Define the TPM Histogram graph file path for graph storing
    Scatter_Plot_path = os.path.join(graphs_files_original, f"{directory}_ScatterPlot_PCC.svg")
    plt.savefig(Scatter_Plot_path, format='svg')
    print(f"Merged read_summary.svg saved to {Scatter_Plot_path}")
else:
    print("Scatter Plot handle is not assigned")
Scatter Plot handle is not assigned

Venn Diagram (Verified Gene Sets) ¶

InĀ [63]:
# Import hORFeome 9.1 datasheet to get verified genes
hORFeome9_1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20220830_hORFeome 9.1.xlsx")
verified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 1]

# Create a copy of stat_filtered_df
original_stat_filtered_df = batch_compile_df.copy()

# Create a dictionary mapping entrez_gene_symbol to Verified values from hORFeome9_1
gene_symbol_to_verified = hORFeome9_1.set_index('entrez_gene_symbol')['Verified'].to_dict()

# Add a new "Verified" column to original_stat_filtered_df based on Gene_Symbol using .map
original_stat_filtered_df['Verified'] = batch_compile_df['Gene_Symbol'].map(gene_symbol_to_verified)

# Get the index of the "Gene_Symbol" column
gene_symbol_index = original_stat_filtered_df.columns.get_loc('Gene_Symbol')

# Create a list of columns with the "Verified" column moved to the desired position
new_columns = list(original_stat_filtered_df.columns)
new_columns.insert(gene_symbol_index + 1, 'Verified')

# Reorder the columns in the original_stat_filtered_df DataFrame
original_stat_filtered_df = original_stat_filtered_df[new_columns]

# Drop any duplicate "Verified" columns
original_stat_filtered_df = original_stat_filtered_df.loc[:, ~original_stat_filtered_df.columns.duplicated()]

# Filter the Verified Columns
verified_stat_filtered_df = original_stat_filtered_df[original_stat_filtered_df['Verified'] == 1]

# Convert to gene sets
verified_genes = set(verified_stat_filtered_df['Gene_Symbol'])
Original_genes = set(verified_hORFeome9_1['entrez_gene_symbol'])

# Calculate the intersections
intersection = verified_genes.intersection(Original_genes)
only_verified_genes = verified_genes - Original_genes
only_original_genes = Original_genes - verified_genes

# Create the Venn diagram
fig, ax = plt.subplots(figsize=(12, 8))  # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_verified_genes), len(only_original_genes), len(intersection)),
             set_labels=('', ''),  # Leave set labels blank as we'll add them manually
             set_colors=('cyan', 'grey'),
             ax=ax)

# Add custom labels with boxes outside the Venn diagram
plt.text(-0.9, 0.0, 'Verified Genes\nfrom analysis', fontsize=12,
         bbox=dict(facecolor='cyan', edgecolor='black', boxstyle='round,pad=0.5'))
plt.text(0.65, 0.0, 'Verified Genes\nfrom hORFeome 9.1', fontsize=12,
         bbox=dict(facecolor='lightgrey', edgecolor='black', boxstyle='round,pad=0.5'))

# Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100

# Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100

# Add the percentage text inside the plot with a box around it
plt.text(0.5, 0.6, f"Common Verified Genes: {intersection_percentage:.1f}%", 
         transform=plt.gca().transAxes,
         horizontalalignment='center', verticalalignment='center', fontsize=10,
         bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))

# Add a title
plt.title("Venn Diagram for Verified Genes", fontsize=14)

# Define the file path for saving the graph
Verified_Genes_path = os.path.join(graphs_files_original, f"{directory}_Venn_Diagram(Verified_Genes).svg")
plt.savefig(Verified_Genes_path, format='svg', bbox_inches='tight', dpi=300)
print(f"Venn Diagram for Verified Genes saved to {Verified_Genes_path}")

# Show the plot
plt.close()
Venn Diagram for Verified Genes saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Venn_Diagram(Verified_Genes).svg

Statistical Analysis ¶

Function Input ¶

InĀ [64]:
def Zscore(X):
    # Standardize rows using Z-scores 
    # Z-Score : A numerical measurement that describes a value's relationship to the mean of the group of values and is mesaured in terms of standard deviations from the mean
    # X: numpy ndarray
    
    [m,n] = X.shape # m and n as number of rows and columns 
    mx = np.nanmean(X,axis=0) # Calculate the sample mean for each column, ignoring NaN values
    stdx = np.nanstd(X,axis=0) # Calculate the sample standard deviation for each column, ignoring NaN values 
    ax = np.divide(X - mx, stdx) # Calculate Z-scores to make sure each column has a standard deviation of 1
    return ax
InĀ [65]:
def stat_uneq(x, y):
    # Statistics for identifying DEGs - t-stat, rank-stat and median difference
    # [t-stat,rank,mdiff] = stat2(x,y)
    # Input arguments
    #   x : treatment data (m x nx)
    #   y : control data (m x ny)
    #   All statistical tests are performed by x-y
    # Output arguments
    #   tstat : t-statistic from two sample t-test
    #   rank : statistic from Wilcoxon's ranksum test
    #   mdiff : median difference
    [m, n] = np.shape(x) # Extract the dimensions of the input arrays 'x' and 'y'

    nx = np.sum(~np.isnan(x),axis=1) # Count sum of the non-NaN values in each row of 'x'
    ny = np.sum(~np.isnan(y),axis=1) # Count sum of the non-NaN values in each row of 'y'

    # Median difference (originally median)
    mdiff = np.nanmedian(x,axis=1) - np.nanmedian(y,axis=1) # Calculate the median difference between corresponding columns of 'x' and 'y'

    # two-sample t-test (unequal variance of two groups is assumed)
    difference = np.nanmean(x,axis=1) - np.nanmean(y,axis=1) # Calculate the mean difference between corresponding columns of 'x' and 'y'

    s2x = np.nanvar(x,axis=1) # Calculate the sample variances along each row of 'x'. 
    s2y = np.nanvar(y,axis=1) # Calculate the sample variances along each row of 'y'.
        # Variance: Measure of sample dispersion, it is a measure of how far a set of numbers is spread out from average value

    s2xbar = np.divide(s2x, nx) # Calculate the sample variance per element for 'x'.
    s2ybar = np.divide(s2y, ny) # Calculate the sample variance per element for 'y'.
        # Step is to calculate degree of freedom

    dfe = np.divide(np.power(s2xbar + s2ybar, 2), np.divide(np.power(s2xbar,2), (nx-1)) + np.divide(np.power(s2ybar,2), (ny-1))) 
        # Degree of Freedom: The maximum number of logically indepedent values, which are values that have fredome to vary, in the data sample 
    se = np.sqrt(s2xbar + s2ybar)
        # Standard Error: How far the sample variance deviates from the actual population mean. The larger the sample, the smaller the SE

    t = np.divide(difference, se) # Compute t-statistics 

    # Ranksum Statistic
    r = pd.DataFrame(np.concatenate((x, y),axis=1).T).rank(method='average',axis=0).to_numpy()
        # Rank the transposed and concatenated data, 'x' and 'y', using the 'average' ranking method and convert it to a NumPy array.
    w = np.zeros([m])
        # Initialize an array 'w' with zeros.
    wvar = np.zeros([m])
        # Initialize an array 'wvar' with zeros.

    for i in range(m):
        tmp_nx = nx[i] # Calling loop variable for nx
        tmp_ny = ny[i] # Calling loop variable for ny

        wvar[i] = ((tmp_nx * tmp_ny) * (tmp_nx + tmp_ny + 1)) / 12 # Variance calculation of the rank-sum statistic 
        if tmp_nx <= tmp_ny:
            w[i] = np.nansum(r[0:tmp_nx,i])
            # If treatment is higher than control Calculate 'w' as the sum of ranked values in the first 'tmp_nx' rows of column 'i' in 'r'.
            if np.sum(~np.isnan(r[0:tmp_nx,i])) == 0 :
                w[i] = np.nan
                wvar[i] = np.nan
            # If it is equal to 0, set 'w' and 'wvar' to NaN.
        elif tmp_nx > tmp_ny:
            w[i] = np.nansum(r[tmp_nx:(tmp_nx + tmp_ny),i])
            # If control is higher than treatment, Calculate 'w' as the sum of ranked values in between tmp_nx and tmp_ny rows of column 'i' in 'r'.
            if np.sum(~np.isnan(r[tmp_nx:(tmp_nx + tmp_ny),i])) == 0 :
                w[i] = np.nan
                wvar[i] = np.nan
            # If it is equal to 0, set 'w' and 'wvar' to NaN.

    min_n =np.min(np.c_[nx,ny], axis=1)
        # Calculate the minimum count of non-NaN values between 'nx' and 'ny'.
    
    # Z-Transformation
    wmean = np.multiply(min_n, nx + ny + 1) / 2
        # Calculates the expected mean of the rank-sum statistic under the null hypothesis.
    wc = w - wmean
        # Centers the rank-sum statistic by subtracting the expected mean.
    wz = np.divide(wc - 0.5 * np.sign(wc), np.sqrt(wvar))
        # Standardizes the centered rank-sum statistic using the Z-transformation.

    wz[np.isnan(wz)] = 0
    wz[np.isinf(wz)] = 0
    # set any NaN (Not-a-Number) or infinite values in the Z-transformed rank-sum statistic wz to 0
    t[np.isnan(t)] = 0
    t[np.isinf(t)] = 0
    # set any NaN or infinite values in the t-statistic t to 0

    return t, w, mdiff, wz

# t - array of t-statistics one for each row between treatment group (x) and control group (y). A higher absolute value of t indicates a larger difference between the groups
# w - array of rank-sum statistics between treatment group (x) and control group (y). Quantifies difference by ranks of values within group. A higher absolute value w indicates a larger difference
# mdiff -  array of median difference. Each element represents the difference in medians between the treatment and control groups for each gene. 
# wz - array of Z-transformed rank-sum statistics. Standardize the rank sum to have mean of 0 and standard deviation of 1 under the null hypothesis. This transformation makes it easier to compare the significance of the rank-sum statistics across different genes
InĀ [66]:
def permutation_test(X, treat_num, control_num, Iter_num):
    # X: row - genes (a data matrix with genes in rows and samples in columns)
    # column - samples
    # treat_num (the number of samples in the treatment group)
    # control_num (the number of samples in the control group)
    # Iter_num (the number of permutations to perform)

    X = np.array(X) # Convert genes 
    [row_num, col_num] = X.shape # Determines the number of rows and columns in the data matrix X and stores these values in row_num and col_num.

    try:
        if col_num < (treat_num + control_num):
            # checks if the total number of samples (col_num) is less than the sum of the treatment and control group sizes (treat_num and control_num).
            raise Exception('Total sample number exceeds data matrix column size!')
    except Exception as e:
        print('InputError: ',e)

    # Initialize variables
    t = []  # Initializes an array 't' to store t-statistics for each gene and permutation.
    w = []  # Initializes an array 'w' to store rank-sum statistics for each gene and permutation.
    m = []  # Initializes an array 'm' to store median differences for each gene and permutation.
    wz = [] # Initializes an array 'wz' to store Z-transformed rank-sum statistics for each gene and permutation.
            # Place to store the results of permuation test.

    # Initialize tqdm for progress tracking
    with tqdm(total=Iter_num) as pbar:
        for i in range(Iter_num):
            col_order = np.random.permutation(col_num) # Generates a random permutation of column indices.
            xp = X[:,col_order[0:treat_num]] # Selects the first 'treat_num' columns as the treatment group.
            yp = X[:,col_order[treat_num:treat_num + control_num]] # Selects the next 'control_num' columns as the control group.
            # Calculates various statistics using the 'stat_uneq' function for the shuffled data.
            t_i, w_i, m_i, wz_i = stat_uneq(xp, yp)
            # Append the statistics for this permutation to the respective lists
            t.append(t_i)
            w.append(w_i)
            m.append(m_i)
            wz.append(wz_i)

            # Update tqdm progress
            pbar.update()
    
    # Convert the lists of statistics to NumPy arrays
    t = np.array(t)
    w = np.array(w)
    m = np.array(m)
    wz = np.array(wz)
    
    return t, w, m, wz # Returns arrays containing the results of the permutation test for each gene.
InĀ [67]:
# Define a function realcomp that takes two inputs: x (data) and opt (transformation option).
def realcomp(x,opt):
    # Get the dimensions of the input data matrix 'x'.
    [n, k] = x.shape

    # Check the transformation option 'opt' to determine which transformation to apply.
    if opt == 1:  # Chi-squared transformation
        # Apply the chi-squared transformation to 'x'.
        z = -2 * np.log(x)
        # Count the non-NaN values in each row of the transformed 'z'.
        k = np.sum(~np.isnan(z), axis=1)
        # Calculate the sum of the non-NaN values in each row of 'z'.
        z = np.nansum(z, axis=1)
        # Calculate 'y' values using the chi-squared cumulative distribution function.
        y = 1 - chi2.cdf(z, 2 * k)
    
    elif opt == 2:  # Logit transformation
        # Apply the logit transformation to 'x'.
        z = np.log(np.divide(x, 1 - x))
        # Count the non-NaN values in each row of the transformed 'z'.
        k = np.sum(~np.isnan(z), axis=1)
        # Perform additional calculations on 'z'.
        z = np.multiply(np.nansum(z, axis=1), np.divide(-np.sqrt(15 * k + 12), np.multiply((5 * k + 2), k * np.power(np.pi, 2))))
        # Calculate 'y' values using the t-distribution cumulative distribution function.
        y = 1 - t.cdf(z, 5 * k + 4)
    
    elif opt == 3:  # Standard Normal (Z-Score) Transformation
        # Apply the standard normal transformation to 'x'.
        z = -norm.ppf(x)
        # Count the non-NaN values in each row of the transformed 'z'.
        k = np.sum(~np.isnan(z), axis=1)
        # Perform additional calculations on 'z'.
        z = np.divide(np.nansum(z, axis=1), np.sqrt(k))
        # Calculate 'y' values using the standard normal cumulative distribution function.
        y = 1 - norm.cdf(z)
    
    # Return the computed final trasnformed data 'y' values and the intermediate transformed data 'z', depending on the option.
    return y, z
InĀ [68]:
def vidz(d, opt, idx):
    # Check the number of columns in the input data
    if d.shape[1] > 2:
        # If more than 2 columns, apply transformation using the specified option
        px, _ = np.array(realcomp(d[:,:-1], opt))
    else:
        # If 2 or fewer columns, take the first column as px
        px = d[:,0]

    # Plot the transformed data against the last column of the input data
    plt.plot(px.reshape(px.size, 1), d[:,-1].reshape(d[:,-1].size, 1), 'g.')

    # Mark specific indices on the plot with magenta circles
    plt.plot(px[idx].reshape(px[idx].size, 1), d[idx,-1].reshape(d[idx,-1].size, 1), 'mo')
InĀ [69]:
def nwpv(d,opt):
    # Set a small value for epsilon
    eps = np.finfo(d.dtype).eps

    # Get the dimensions of the input data
    [n, k] = d.shape

    # Iterate over each column in the input data
    for i in range(k):
        # Replace zeros in the column with small values
        tn = np.array(np.where(d[:,i] == 0))
        if np.array(tn).size != 0:
            d[np.where(d[:,i] == 0),i] = np.linspace(1.e-10, eps, len(tn)).T

        # Replace very small values with epsilon
        if np.array(np.where(d[:,i] < eps)).size != 0:
            d[np.where(d[:,i] < eps),i] = eps

        # Replace values close to 1 with a value slightly less than 1
        if np.array(np.where(d[:,i] > 0.99999999999999994)).size != 0:
            d[np.where(d[:,i] > 0.99999999999999994),i] = 0.99999999999999994

    # Apply a transformation to the modified data using the specified option
    [y, z] = realcomp(d, opt)

    # Find indices where the transformed values are less than 0.05
    idx = np.where(y < 0.05)

    # Plot the modified data and mark specific indices
    vidz(d, opt, idx)

    # Return the transformed values and the marked indices
    return y, idx
InĀ [70]:
def pval2tail(s0, s):
    # p-value computation routine
    # s0: null statistic, s: observed statistic
    # p-value: computed by two-tail test

    # Stack the null statistics s0 into a 1D array
    stacked_s0 = np.hstack(s0)

    # Compute the empirical cumulative distribution function (ECDF) for the null statistics
    ecdf_res = ECDF(stacked_s0)

    # Extract unique values (excluding the first one) and their corresponding cumulative probabilities
    s0 = ecdf_res.x[1:]
    f0 = ecdf_res.y[1:]

    # Create a function that interpolates the ECDF values
    f = interp1d(s0, f0, bounds_error=False)

    # Compute the p-value by evaluating the interpolated function at the observed statistic s
    p = f(s)

    # Replace any NaN values in p with 0
    p[np.isnan(p)] = 0

    # Calculate the two-tailed p-value by taking the minimum of p and 1-p and multiplying by 2
    p = 2 * np.min(np.c_[p, 1 - p], axis=1)

    # Handle special cases when p-values are 0 or 1
    if np.sum(p != 0) != 0:
        # If there are non-zero p-values, replace zeros with the smallest non-zero value divided by 2
        p[p == 0] = np.nanmin(p[p != 0]) / 2
    else:
        # If all p-values are zero, replace them with a small non-zero value
        p[p == 0] = 1e-10

    # Adjust p-values that are exactly 1 to ensure they are within [0, 1]
    p[p == 1] = (1 - np.max(p[p != 1])) / 2 + np.max(p[p != 1])

    return p
InĀ [71]:
def pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, pscale, pcomb):
    # Input arguments
    # x: treatment data (m by nx)
    # y: control data (m by ny)
    # pscale: type=bool, if True, scale p-value using Liptak-Stouffer's Z method
    # pcomb: p-value combination option
    #        1 - pt & pm
    #        2 - pwz & pm
    #        3 - pt & pwz & pm

    # Calculate various statistics for treatment and control data
    [tobs, wobs, mobs, wzobs] = stat_uneq(x, y)

    # Calculate one-tailed p-values for different statistics
    pt = pval2tail(trnd, tobs)  # p-value for t-statistic
    pw = pval2tail(wrnd, wobs)  # p-value for ranksum statistic
    pm = pval2tail(mrnd, mobs)   # p-value for median difference
    pwz = pval2tail(wzrnd, wzobs)  # p-value for Z-transformed ranksum statistic

    # Combine p-values based on pcomb option
    if pcomb == 1:
        pall = np.c_[pt, pm]  # Combine t-statistic and median difference p-values
    elif pcomb == 2:
        pall = np.c_[pwz, pm]  # Combine Z-transformed ranksum and median difference p-values
    elif pcomb == 3:
        pall = np.c_[pt, pwz, pm]  # Combine t-statistic, Z-transformed ranksum, and median difference p-values

    # Scale p-values using Liptak-Stouffer's Z method if pscale is True
    if pscale:
        pall = 1 - norm.cdf(Zscore(-norm.ppf(pall)))

    # Calculate overall p-value using non-parametric weighted p-value combination
    ovp = np.array(nwpv(pall, 3)[0])

    return pt, pw, pm, pwz, ovp
# pt represents the one-tailed p-values for the t-statistic. It measures significanace difference between treatment vs control groups, based on means.
# pw is the one-tailed p-value associated with the Wilcoxon rank-sum statistic. This statistic assesses the difference in location (median) between the treatment vs control groups.
# pm is the one-tailed p-value for the median difference between the treatment and control groups. It measures the significance of differences in medians.
# pwz represents the one-tailed p-values for the Z-transformed ranksum statistic. The Z-transform standardizes the ranksum statistic, making it easier to compare significance across genes.

Statistical dataframe¶

InĀ [72]:
"""
Unlike original database (Not removing any fault columns - Less than ~ 7,000 genes),
removing all the uncessary columns for stat analysis (except mCherry+/BFP- etc)
"""

# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")

# Cleaning out dAAVS1 and pDest from the dataframe 
exclude_keywords = ["dAAVS1", "pDest"]
data_nor = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]

# Drop the columns that have less than half of maximum genes
# exclude_columns = ["63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative"]
Less_than_half_of_max = gene_counts[gene_counts < (max_gene_count /2) ].index
Less_than_half_of_max = list(Less_than_half_of_max)
Less_columns = Less_than_half_of_max
data_nor = data_nor.drop(Less_columns, axis=1)

# Changing ORF_ID columns to number and sort the dataframe
data_nor["ORF_ID"] = pd.to_numeric(data_nor["ORF_ID"], errors="coerce")
data_nor = data_nor.sort_values("ORF_ID").reset_index(drop=True)

# Select data based on header values
columns_to_include = [column for column in data_nor.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
data_col = data_nor[columns_to_include].astype(float)

# Add length for length normalization
gene_info = data_nor[["Gene_Symbol", "Length"]]
gene_merge = gene_info.merge(data_col, left_index=True, right_index=True)

# Set dataframes for gene expression normalziation 
data_edgeR = data_col
data_bioinfokit = gene_merge

# norm function associated with scipy.
from bioinfokit.analys import norm

# Define the normalization method you want to use: 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'
chosen_normalization = 'TMM'  # Change this to your desired normalization method

if chosen_normalization == 'CPM':
    # Load your data and replace NaN with 0
    data_cpm = data_edgeR.fillna(0)

    # Convert pandas DataFrame to R data frame
    data_raw_r = df_to_r_dataframe(data_cpm)

    # Calculate CPM in R
    norm_raw = edgeR.cpm(data_raw_r)
    norm_log = edgeR.cpm(data_raw_r, log=True)

    # Access column names
    norm_colnames = list(data_raw_r.colnames)

elif chosen_normalization in ['GeTMM', 'TMM']:
    if chosen_normalization == 'GeTMM':
        # Load your data and replace NaN with 0 and drop Gene_Symbol
        data_norm = data_bioinfokit.fillna(0)
        data_norm = data_norm.drop(['Gene_Symbol'], axis=1)
        
        # Assuming 'data_norm' is your DataFrame
        data_norm['Length'] = data_norm['Length'] / 10**3
        data_norm.iloc[:, 1:] = data_norm.iloc[:, 1:].div(data_norm['Length'], axis=0)
        
        # Select data based on header values
        columns_to_include = [col for col in data_bioinfokit.columns if any(any(sample in col for sample in samples) for samples in sample_key.values())]
        data_norm = data_norm[columns_to_include]
    
    elif chosen_normalization == 'TMM':
        data_norm = data_edgeR.fillna(0)
    
    # Set the GeTMM/TMM handle
    getmm_tmm_handle = "Control"  # Change this to your desired normalization factors
    
    # Perform normalization and get results
    dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = edgeR_normfactor(data_norm, getmm_tmm_handle)

elif chosen_normalization in ['RPKM', 'TPM']:
    if chosen_normalization == 'RPKM':
        nm_method = 'rpkm'
    elif chosen_normalization == 'TPM':
        nm_method = 'tpm'
    
    data_norm = data_bioinfokit.fillna(0)

    # Convert 'Length' column to numeric (if it's not already)
    data_norm['Length'] = pd.to_numeric(data_norm['Length'], errors='coerce')

    # Make 'Gene_Symbol' column as the index column
    data_norm.set_index('Gene_Symbol', inplace=True)
    
    nm = norm()
    getattr(nm, nm_method)(df=data_norm, gl='Length')

    # Get the normalized DataFrame
    nor_df = getattr(nm, f'{nm_method}_norm')

    # Reset index back to default integer index
    nor_df.reset_index(drop=True, inplace=True)
    nor_raw = nor_df

    # Calculate the logarithm of nor values (base 2, with a small constant added)
    avoid_nan = 0.18050946883  # Mimic edgeR cpm(log=True)
    nor_log = (np.log2(nor_raw + avoid_nan)).astype(float)

    # Convert nor_raw and nor_log DataFrame to array
    norm_raw = nor_raw.values
    norm_log = nor_log.values

    # Access column names
    norm_colnames = nor_raw.columns.tolist()

else:
    raise ValueError(f"Invalid normalization method: {chosen_normalization}. Choose 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'.")

# cpm_raw to dataframe for storage
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
nor_log_df = pd.DataFrame(data=np.where(norm_log != 0, norm_log, np.nan), columns=norm_colnames)

# Saving the merged DataFrame
nor_raw_path = os.path.join(database_files_stats, f"{directory}_stats_nor_raw.xlsx")
nor_raw_compile_df = save_dataframe(fasta_data, nor_raw_df, nor_raw_path)

nor_log_path = os.path.join(database_files_stats, f"{directory}_stats_nor_log.xlsx")
nor_log_compile_df = save_dataframe(fasta_data, nor_log_df, nor_log_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_raw.xlsx
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_log.xlsx

Normalization factors ¶

InĀ [73]:
#Retrieving right data
columns_to_include = [column for column in nor_raw_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
norm_data = nor_raw_compile_df[columns_to_include]
norm_data = norm_data.dropna(how='all')
norm_data = norm_data.astype(float)
norm_data = np.log2(norm_data)

# Extract the column names from the pandas DataFrame
col_headers = norm_data.columns.tolist()

# Convert pandas DataFrame to R data frame
nor_clean_quant_batch_r = df_to_r_dataframe(norm_data)

# Initialize variables
batch_assignment = []
NGS_assignment = []
control_or_experimental = []
current_batch = 0

# Function to get the batch name based on the current batch number
def get_batch_name(batch_number):
    if batch_number == 0:
        return "BatchBaseline"
    else:
        return "Batch" + str(batch_number)
    
def determine_control_or_experimental(header):
    if "Baseline" in header :
        return "Baseline"
    elif "DMSO" in header :
        return "Control"
    elif "mCherryPositive&BFPNegative" in header or "mCherryNegative&BFPNegative"  in header:
        return "plasmid"
    else:
        return "Experiment"

# Assign batches based on the header pattern
for name in col_headers:
    control_or_experimental.append(determine_control_or_experimental(name))
    if "Baseline" in name:
        current_batch += 1
    batch_assignment.append(get_batch_name(current_batch))
    if current_batch <= 4:
        NGS_assignment.append("NGS1")
    elif current_batch == 5:
        NGS_assignment.append("NGS2")

# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)
NGS = robjects.vectors.StrVector(NGS_assignment)
control_exp = robjects.vectors.StrVector(control_or_experimental)

# R vector to R factor 
batch = robjects.r.factor(batch)
NGS = robjects.r.factor(NGS)
control_exp = robjects.r.factor(control_exp)

Noise detection ¶

InĀ [74]:
stats = importr("stats")

# Histogram of log2 to determine proper cutoff
median_log2_nor = base.apply(norm_log, 1, stats.median)
graphics.hist(median_log2_nor)
expr_cutoff = -1.0
graphics.abline(v=expr_cutoff, col="red", lwd=3)
expr_count = base.sum(FloatVector(np.array(median_log2_nor) > expr_cutoff))[0]

# Plot histogram using Python
plt.figure(figsize=(18, 9))  # Adjust the figure size as desired

# Calculate the bin width and adjust the bar width and spacing
num_bins = 50
data_range = np.ptp(median_log2_nor)
bin_width = data_range / num_bins
bar_width = 3.0 * bin_width
bar_spacing = bin_width - bar_width

# Plot the histogram with adjusted bar width and spacing
plt.hist(median_log2_nor, bins=num_bins, range=(np.min(median_log2_nor), np.max(median_log2_nor)),
         color='black', edgecolor='black', linewidth=0.5,
         rwidth=bar_width, align='mid')

# Set the background color and gridlines
plt.gca().set_facecolor('lightgray')
plt.grid(color='white', linestyle='-', linewidth=0.5)

plt.axvline(x=expr_cutoff, color='red', linewidth=3)
plt.title('Histogram of log2 nor', fontsize=16)
plt.xlabel('Log2 nor', fontsize=16)
plt.ylabel('No of Genes in log2', fontsize=16)
plt.yscale('log')
plt.tick_params(axis='both', labelsize=12) 

print("Total number of genes after the cutoff:", int(expr_count))

# Define the nor Histogram graph file path for graph storing
nor_histogram_path = os.path.join(graphs_files_stats, f"{directory}_stats_norm_histogram.svg")
plt.savefig(nor_histogram_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_norm_histogram.svg saved to {nor_histogram_path}")

# Showing the figure
# plt.close()
plt.show()
Total number of genes after the cutoff: 14506
RQ023682_stats_norm_histogram.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_norm_histogram.svg
No description has been provided for this image

Noise removal from dataframe ¶

InĀ [75]:
# Convert median_log2_nor to a numpy array
median_log2_nor_np = np.array(median_log2_nor)

# Create a boolean mask based on the expression cutoff
mask = median_log2_nor_np > expr_cutoff
unmask = median_log2_nor_np <= expr_cutoff

# Get the indices where the mask is True
indices = np.where(mask)[0]
non_indices = np.where(unmask)[0]

# Subset nor_raw based on the indices
nor_clean = norm_raw[indices, :]
# Subset nor_raw based on not in the indices
nor_unclean = norm_raw[non_indices, :]

# Access column names
if chosen_normalization in ['RPKM', 'TPM']:
    nor_colnames = nor_raw.columns.tolist()
elif chosen_normalization  in ['CPM', 'TMM', 'GeTMM']:
    nor_colnames = list(data_raw_r.colnames)

# nor_raw to dataframe for storage
nor_clean_df = pd.DataFrame(data=np.where(nor_clean != 0, nor_clean, np.nan), index=indices, columns=nor_colnames)
nor_unclean_df = pd.DataFrame(data=np.where(nor_unclean != 0, nor_unclean, np.nan), index=non_indices, columns=nor_colnames)

# Saving the merged DataFrame
nor_clean_path = os.path.join(database_files_stats, f"{directory}_stats_nor_clean.xlsx")
nor_clean_compile_df = save_dataframe(fasta_data, nor_clean_df, nor_clean_path)

# Saving the merged Dataframe
nor_unclean_path = os.path.join(database_files_stats, f"{directory}_stats_nor_unclean.xlsx")
nor_unclean_compile_df = save_dataframe(fasta_data, nor_unclean_df, nor_unclean_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_clean.xlsx
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_unclean.xlsx

Correlation matrix clustering (Pre-normalization) ¶

InĀ [76]:
# Get the column names from data_raw
col_names = nor_clean_df.columns.values.tolist()

# Calculate the correlation matrix in R
cor_matrix = stats.cor(nor_clean, use="everything", method ="pearson")

# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)

# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)

# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)

# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']

# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]

# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
                   dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
                   xticklabels=col_names, yticklabels=col_names)

# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)

# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering',  fontsize=16, pad=20, loc='center', y=1.15)

# Define the nor Histogram graph file path for graph storing
PCC_Heatmap_path = os.path.join(graphs_files_stats, f"{directory}_stats_PCC_Heatmap.svg")
plt.savefig(PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_stats_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")

# Showing the figure
plt.close()
# plt.show()
RQ023682_stats_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_PCC_Heatmap.svg

Box & Violin Plot (Pre-normalization) ¶

InĀ [77]:
# Removing nan values
nor_clean_compile_df.replace(['nan'], np.nan, inplace=True)

# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include].astype(float)
sample_data = sample_data.dropna(how='all')
sample_data = sample_data.astype(float)

# Get the gene symbols corresponding to the data points
gene_symbols = nor_clean_compile_df.loc[sample_data.index, 'Gene_Symbol']

# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)

# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)

# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')

# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)

# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())

# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors)  # Or any other color palette you prefer

# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)

# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)

# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))

# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')

# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart', 
               palette=sample_color_dict
               )
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)]) # Use quantiles as x-axis labels
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')

# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pre_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pre_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")

# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/1344428164.py:69: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

RQ023682_stats_Pre_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_Box_Violin_Plot.svg

Density Plot (Pre-normalization) ¶

InĀ [78]:
# Preparation of additional dataframe for density plot
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
Pre_norm_density = nor_clean_compile_df[columns_to_include]
Pre_norm_density = np.log2(Pre_norm_density.dropna(how='all').astype(float))
col_renames = [f'{batch}_{NGS}_{control_exp}' for batch, NGS, control_exp in zip(batch_assignment, NGS_assignment, control_or_experimental)]
Pre_norm_density.columns = col_renames
Pre_norm_renamed = Pre_norm_density

# Create a list of unique batches, NGS, and control/experiment groups
unique_batches = sorted(set(batch_assignment))
unique_NGS = sorted(set(NGS_assignment))
unique_control_exp = sorted(set(control_or_experimental))

# Set up the plot grid
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))

# Overlay density plots for batches
for i, batch in enumerate(unique_batches):
    batch_columns = [col for col in Pre_norm_renamed.columns if batch in col]
    batch_data = Pre_norm_renamed[batch_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=batch_data, fill=True, ax=axes[0], label=f'{batch}')
axes[0].set_title('Overlayed Density Plots for Batches')
axes[0].set_xlabel('Gene Expression Level')
axes[0].legend()

# Overlay density plots for NGS
for i, NGS in enumerate(unique_NGS):
    NGS_columns = [col for col in Pre_norm_renamed.columns if NGS in col]
    NGS_data = Pre_norm_renamed[NGS_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=NGS_data, fill=True, ax=axes[1], label=f'{NGS}')
axes[1].set_title('Overlayed Density Plots for NGS')
axes[1].set_xlabel('Gene Expression Level')
axes[1].legend()

# Overlay density plots for control/experiment
for i, control_exp in enumerate(unique_control_exp):
    control_exp_columns = [col for col in Pre_norm_renamed.columns if control_exp in col]
    control_exp_data = Pre_norm_renamed[control_exp_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=control_exp_data, fill=True, ax=axes[2], label=f'{control_exp}')
axes[2].set_title('Overlayed Density Plots for Control/Experiment')
axes[2].set_xlabel('Gene Expression Level')
axes[2].legend()

# Define the Density plot file path for graph storing
Pre_Density_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pre_density_Plot.svg")
plt.savefig(Pre_Density_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pre_density_Plot.svg saved to {Pre_Density_Plot_path}")

# Showing the figure
plt.tight_layout()
# plt.show()
plt.close()
RQ023682_stats_Pre_density_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_density_Plot.svg

Winsorization (DescTools) ¶

InĀ [79]:
nor_clean_compile_df
Out[79]:
ORF_ID NCBI Group Gene_Symbol GC_Content 1-Baseline-batch1 2-DMSO-A1 3-DMSO-B1 4-DMSO-C1 5-Paclitaxel-A ... 62-Vinblastine-C 63-mCherryPositive&BFPNegative 64-mCherryNegative&BFPNegative 68-Baseline-batch5 69-DMSO-A 70-DMSO-B 71-DMSO-C 72-TAS102-A 73-TAS102-B 74-TAS102-C
0.0 1.0 805.0 G06 CALM2 39.111111 191.267044 150.099265 144.572297 140.791295 129.090270 ... 246.151411 184.665331 69.709985 156.997897 80.086365 94.963157 135.269365 115.335034 102.790008 68.967454
1.0 2.0 2629.0 G02 GBA 55.245189 21.923890 21.167845 17.460422 17.189635 33.467848 ... 9.567723 22.159840 28.916734 16.204521 18.925308 21.168870 17.780832 56.132640 28.393814 49.123045
2.0 3.0 10282.0 G03 BET1 38.375350 124.739377 141.118967 97.778365 144.884065 148.214754 ... 192.224247 155.939613 14.974737 96.961476 96.242116 82.499243 49.966388 120.597469 113.355148 69.292772
4.0 6.0 7178.0 G02 TPT1 44.123314 78.623607 155.872313 114.540370 84.311066 69.667765 ... 113.073086 115.723607 19.622070 60.036421 57.006721 75.377006 49.741314 71.700677 38.518740 35.459682
5.0 7.0 8089.0 G01 YEATS4 36.111111 100.547498 129.572870 114.540370 106.412025 159.143031 ... 58.276129 120.648016 137.870859 134.152179 101.781231 106.042192 105.784694 120.816738 61.629983 91.089090
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18383.0 100080862.0 653427.0 delta FOXD4L5 61.280000 18.899906 35.921192 24.444591 28.649391 31.418796 ... 4.348965 18.056166 32.531326 11.688507 17.540530 27.895427 37.587328 18.857059 8.804283 10.735500
18384.0 100080864.0 389058.0 delta SP5 57.239627 31.751841 28.865243 27.936676 45.020472 53.275349 ... 20.875031 38.574536 36.662288 26.830436 29.541945 27.104068 24.983194 37.494850 52.385486 33.507773
18385.0 100080865.0 642623.0 delta UBTFL1 63.686636 34.775826 12.828997 2.793668 19.645297 8.879225 ... 17.395859 41.857475 17.556589 69.865392 56.775925 28.686787 53.342495 7.235848 20.249852 27.652045
18386.0 100080869.0 100131980.0 delta ZNF705G 65.692308 22.679887 15.394796 32.825594 20.463851 15.026381 ... 40.880270 54.168497 6.712813 25.502196 20.079290 22.553750 7.652510 32.013146 24.431886 25.374818
18387.0 100080871.0 7617.0 delta ZNF66 62.395076 1.511992 NaN NaN 0.818554 1.366035 ... NaN 0.820735 5.163703 4.516014 2.769557 2.176239 2.025664 1.096341 5.062463 1.301273

14506 rows Ɨ 72 columns

InĀ [80]:
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include].astype(float)

#Set the quantile values for winsorization 
lower_quantile = 0.00001
upper_quantile = 0.99999

# Turn pandas dataframe to R dataframe
winsorize_r = df_to_r_dataframe(sample_data)

# Run DescTools winsorization 
winsorize_df = winsorize_func(winsorize_r, lower_quantile, upper_quantile)

# Save the windorized database
windorize_path = os.path.join(database_files_stats, f"{directory}_windsorize.xlsx")
windorize_compile_df = save_dataframe(fasta_data, winsorize_df, windorize_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_windsorize.xlsx

Upper Quantile Normalization (edgeR) ¶

InĀ [81]:
# Upper quantile normalization function
def upper_quantile_normfactor(data, handle):
    experimental_columns = data.columns
    
    if handle == "Control":
        # Define your control and experimental keywords
        neg_control_keywords = ["DMSO", "Baseline"]
        none_keywords = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"
                        #  , "Serumfree"
                         ]
        
        # Determine group names based on column names
        group_names = [
            "Control" if any(keyword in col for keyword in neg_control_keywords)
            else ("None" if any(keyword in col for keyword in none_keywords)
            else "Experimental")
            for col in experimental_columns
        ]
        # Determine group names based on column names
        group_names = [
            "Control" if any(keyword in col for keyword in neg_control_keywords)
            else ("Experimental")
            for col in experimental_columns
        ]
    elif handle == "Triplet":
        # Extract the second part of each column name after splitting by underscore
        extracted_keywords = [col.split('-')[1] for col in experimental_columns]
        group_names = extracted_keywords
        
    group_factor = robjects.FactorVector(group_names)

    data_raw_r = df_to_r_dataframe(data)
    
    dge = edgeR.DGEList(counts=data_raw_r, group=group_factor)
    dge = edgeR.calcNormFactors(dge, method="upperquartile", p=0.75)
    dge_normfactors_r = dge.rx2('samples')
    
    with (robjects.default_converter + pandas2ri.converter).context():
        dge_normfactors_df = robjects.conversion.get_conversion().rpy2py(dge_normfactors_r)
    
    norm_raw = edgeR.cpm(dge)
    norm_log = edgeR.cpm(dge, log=True)
    
    norm_colnames = list(data_raw_r.colnames)
    
    return dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r

# Fill in the NaN to 0
winsorize_data = winsorize_df.fillna(0)

# Set the handle
upperquant_handle = "Control" 

# Perform normalization and get results
dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = upper_quantile_normfactor(winsorize_data, upperquant_handle)

# Return 0 values to NaN
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
# Set database to a dedicated index
nor_raw_df  = nor_raw_df.set_index(winsorize_df.index)
# Set values to log2
nor_raw_df = np.log2(nor_raw_df)

# Save Upper quantile normalized database
upper_quant_compile_path = os.path.join(database_files_stats, f"{directory}_upper_quant.xlsx")
upper_quant_compile_df = save_dataframe(fasta_data, nor_raw_df, upper_quant_compile_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_upper_quant.xlsx

Batch correction (Limma) ¶

InĀ [82]:
# Preparation of additional dataframe for batch correction
columns_to_include = [column for column in nor_raw_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data = nor_raw_df[columns_to_include]
batch_data = batch_data.dropna(how='all').astype(float)

# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)
NGS = robjects.vectors.StrVector(NGS_assignment)
control_exp = robjects.vectors.StrVector(control_or_experimental)

# R vector to R factor 
batch = robjects.r.factor(batch)
NGS = robjects.r.factor(NGS)
control_exp = robjects.r.factor(control_exp)

# Metadata build up
col_names = [f'{control_exp}_{NGS}' for  control_exp, NGS  in zip(control_or_experimental, NGS_assignment)]
split_col_names = [name.split('_') for name in col_names]

# Organizing into a DataFrame
metadata_df = pd.DataFrame(split_col_names, columns=['control_exp', 'NGS'], index=col_names)

# Convert pandas DataFrame to R data frame
metadata_r = df_to_r_dataframe(metadata_df)
batch_data_col_r = df_to_r_dataframe(batch_data)

col_names_r = robjects.vectors.StrVector(col_names)
robjects.r.assign('batch_data_col_r', batch_data_col_r)
robjects.r.assign('col_names', col_names_r)
robjects.r('colnames(batch_data_col_r) <- col_names')

# Assign row names to the DataFrame (assuming index is the original names)
robjects.r.assign('metadata_r', metadata_r)

# Create a model matrix using metadata_df in R
if len(set(batch_assignment)) > 1:
    model_matrix = robjects.r('model.matrix(~ 0 + control_exp + 0 + NGS, data = metadata_r)')
    batch_names = ['Baseline', 'Control', 'Experiment', 'Plasmid', 'NGS']
    model_matrix_df = pd.DataFrame(model_matrix, columns=batch_names)

    # Convert pandas DataFrame to R data frame
    model_matrix_r = df_to_r_dataframe(model_matrix_df)
    batch_data_r = df_to_r_dataframe(batch_data)
else:
    pass

# Check if batch is not empty
if len(set(batch_assignment)) > 1:
    # Perform batch correction
    batch_corrected_data = limma.removeBatchEffect(batch_data_r, batch=batch
                                                   ,design=model_matrix_r
                                                   )
else:
    # Skip batch correction
    batch_corrected_data = upper_quant_compile_df

# Change R DataFrame to pandas DataFrame
batch_stat_df = pd.DataFrame(batch_corrected_data, columns=col_headers)
batch_stat_df = batch_stat_df.set_index(batch_data.index)
batch_stat_df.replace(0, np.nan, inplace=True)

# Saving the merged DataFrame
batch_compile_path = os.path.join(database_files_stats, f"{directory}_batch_corrected.xlsx")
batch_compile_stat_df = save_dataframe(fasta_data, batch_stat_df, batch_compile_path)

#If "coeeficient not estimable:" shows. It means levels are perfectly colinear or have no variability within them.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/rpy2/robjects/pandas2ri.py:56: UserWarning:

DataFrame contains duplicated elements in the index, which will lead to loss of the row names in the resulting data.frame

Coefficients not estimable: batch4 
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_batch_corrected.xlsx

Correlation matrix clustering (Post-normalization) ¶

InĀ [83]:
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")

# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_stat_df.fillna(0)
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)

# Convert pandas DataFrame to R data frame
batch_compile_df_corr_r = df_to_r_dataframe(batch_compile_df_corr)

# Get the column names from data_raw
col_names = batch_compile_df_corr.columns.values.tolist()

# Calculate the correlation matrix in R
cor_matrix = stats.cor(batch_compile_df_corr_r, use="everything", method ="pearson")

# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)

# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)

# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)

# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']

# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]

# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
                   dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
                   xticklabels=col_names, yticklabels=col_names)

# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)

# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering',  fontsize=16, pad=20, loc='center', y=1.15)

# Define the CPM Histogram graph file path for graph storing
Post_PCC_Heatmap_path = os.path.join(graphs_files_stats, f"{directory}_stats_Post_PCC_Heatmap.svg")
plt.savefig(Post_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Post_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")

# Showing the figure
# plt.close()
plt.show()
RQ023682_stats_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_PCC_Heatmap.svg
No description has been provided for this image

Box & Violin Plot (Post-normalization) ¶

InĀ [84]:
# Removing nan values
batch_compile_stat_df.replace(['nan'], np.nan, inplace=True)

# Select the columns containing the sample data
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = batch_compile_stat_df[columns_to_include].astype(float)
sample_data = sample_data.dropna(how='all')
sample_data = sample_data.astype(float)

# Get the gene symbols corresponding to the data points
# gene_symbols = nor_clean_compile_df.iloc[sample_data.index, 'Gene_Symbol']

# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)

# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)

# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')

# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)

# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())

# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]

# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]

# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
    sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]

# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors)  # Or any other color palette you prefer

# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)

# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)

# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))

# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
# axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')

# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart', 
               palette=sample_color_dict
               )
# axs[1].set_xscale('log')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)]) # Use quantiles as x-axis labels
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')

# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pro_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pro_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")

# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/1146168366.py:70: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

RQ023682_stats_Pro_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pro_Box_Violin_Plot.svg

Density Plot (Post-normalization) ¶

InĀ [85]:
# Preparation of additional dataframe for density plot
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data_pro_den = batch_compile_stat_df[columns_to_include]
batch_data_pro_den = batch_data_pro_den.dropna(how='all').astype(float)
col_renames = [f'{batch}_{NGS}_{control_exp}' for batch, NGS, control_exp in zip(batch_assignment, NGS_assignment, control_or_experimental)]
batch_data_pro_den.columns = col_renames
batch_data_renamed = batch_data_pro_den

# Create a list of unique batches, NGS, and control/experiment groups
unique_batches = sorted(set(batch_assignment))
unique_NGS = sorted(set(NGS_assignment))
unique_control_exp = sorted(set(control_or_experimental))

# Set up the plot grid
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))

# Overlay density plots for batches
for i, batch in enumerate(unique_batches):
    batch_columns = [col for col in batch_data_renamed.columns if batch in col]
    batch_data = batch_data_renamed[batch_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=batch_data, fill=True, ax=axes[0], label=f'{batch}')
axes[0].set_title('Overlayed Density Plots for Batches')
axes[0].set_xlabel('Gene Expression Level')
axes[0].legend()

# Overlay density plots for NGS
for i, NGS in enumerate(unique_NGS):
    NGS_columns = [col for col in batch_data_renamed.columns if NGS in col]
    NGS_data = batch_data_renamed[NGS_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=NGS_data, fill=True, ax=axes[1], label=f'{NGS}')
axes[1].set_title('Overlayed Density Plots for NGS')
axes[1].set_xlabel('Gene Expression Level')
axes[1].legend()

# Overlay density plots for control/experiment
for i, control_exp in enumerate(unique_control_exp):
    control_exp_columns = [col for col in batch_data_renamed.columns if control_exp in col]
    control_exp_data = batch_data_renamed[control_exp_columns].stack().reset_index(drop=True)
    sns.kdeplot(data=control_exp_data, fill=True, ax=axes[2], label=f'{control_exp}')
axes[2].set_title('Overlayed Density Plots for Control/Experiment')
axes[2].set_xlabel('Gene Expression Level')
axes[2].legend()

# Define the Density plot file path for graph storing
Pro_Density_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pro_density_Plot.svg")
plt.savefig(Pre_Density_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Pre_density_Plot.svg saved to {Pre_Density_Plot_path}")

# Showing the figure
plt.tight_layout()
# plt.show()
plt.close()
RQ023682_Pre_density_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_density_Plot.svg

Venn Diagram (Verified Gene Sets) ¶

InĀ [86]:
Total_ORFs = pd.to_numeric(data_rc['NCBI'], errors='coerce').dropna().sort_values()
clean_ORFs = pd.to_numeric(batch_compile_stat_df['NCBI'], errors='coerce').dropna()

Total_Sets = set(Total_ORFs)
Clean_Sets = set(clean_ORFs)

intersection = Clean_Sets.intersection(Total_Sets)
only_Total_genes = Total_Sets - intersection  
only_Clean_genes = Clean_Sets - Total_Sets

fig, ax = plt.subplots(figsize=(16, 12))  # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_Total_genes), len(only_Clean_genes), len(intersection)),
             set_labels=('Total ORF', 'Clean ORF'),  # <<< --- THIS HIDES THE 'A' AND 'B' LABELS
            #  set_colors=("#BABDFF", '#CEFFBA'),
             ax=ax) 

Total_genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn_Diagram(Total_Genes).svg")
plt.savefig(Total_genes_path , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Verified_Genes_path}")
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Venn_Diagram(Verified_Genes).svg
No description has been provided for this image
InĀ [87]:
# Import hORFeome 9.1 datasheet to get verified genes
hORFeome9_1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20220830_hORFeome 9.1.xlsx")
verified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 1]
unverified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 0]

# Create a copy of stat_filtered_df
original_stat_filtered_df = batch_compile_df.copy()

# Create a dictionary mapping entrez_gene_symbol to Verified values from hORFeome9_1
gene_symbol_to_verified = hORFeome9_1.set_index('entrez_gene_symbol')['Verified'].to_dict()

# Add a new "Verified" column to original_stat_filtered_df based on Gene_Symbol using .map
original_stat_filtered_df['Verified'] = batch_compile_df['Gene_Symbol'].map(gene_symbol_to_verified)

# Get the index of the "Gene_Symbol" column
gene_symbol_index = original_stat_filtered_df.columns.get_loc('Gene_Symbol')

# Create a list of columns with the "Verified" column moved to the desired position
new_columns = list(original_stat_filtered_df.columns)
new_columns.insert(gene_symbol_index + 1, 'Verified')

# Reorder the columns in the original_stat_filtered_df DataFrame
original_stat_filtered_df = original_stat_filtered_df[new_columns]

# Drop any duplicate "Verified" columns
original_stat_filtered_df = original_stat_filtered_df.loc[:, ~original_stat_filtered_df.columns.duplicated()]

# Filter the Verified Columns
verified_stat_filtered_df = original_stat_filtered_df[original_stat_filtered_df['Verified'] == 1]

# Convert to gene sets
verified_genes = set(verified_stat_filtered_df['Gene_Symbol'])
Original_genes = set(verified_hORFeome9_1['entrez_gene_symbol'])

# Calculate the intersections
intersection = verified_genes.intersection(Original_genes)
only_verified_genes = verified_genes - Original_genes
only_original_genes = Original_genes - verified_genes

# Create the Venn diagram
fig, ax = plt.subplots(figsize=(16, 12))  # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_verified_genes), len(only_original_genes), len(intersection)),
             set_labels=('Analyzed Genes', 'hORFeome 9.1'),  # <<< --- THIS HIDES THE 'A' AND 'B' LABELS
            #  set_colors=("#BABDFF", '#CEFFBA'),
             ax=ax) # Ensure ax is passed if you have a custom figure/axes

# color_left_only = '#FFC0CB' # Pink
color_right_only = '#BAFFF8' # Light Blue
color_intersection = '#FFD0BA' # Medium Purple

# if venn.get_patch_by_id('10'):
#     venn.get_patch_by_id('10').set_facecolor(color_left_only)
#     venn.get_patch_by_id('10').set_alpha(1.0) # Ensure full opacity if desired

if venn.get_patch_by_id('01'):
    venn.get_patch_by_id('01').set_facecolor(color_right_only)
    venn.get_patch_by_id('01').set_alpha(1.0) # Ensure full opacity if desired

if venn.get_patch_by_id('11'):
    venn.get_patch_by_id('11').set_facecolor(color_intersection)
    venn.get_patch_by_id('11').set_alpha(1.0) # Ensure full opacity if desired

# subset_ids = ['10', '01', '11'] # '10': only A, '01': only B, '11': intersection
subset_ids = ['10'] # '10': only A, '01': only B, '11': intersection

for subset_id in subset_ids:
    label = venn.get_label_by_id(subset_id)
    if label: # Check if the label exists
        label.set_visible(False) # Set its visibility to False
        
subsets = ['01', '11']
for subset in subsets:
    number_label = venn.get_label_by_id(subset)
    if number_label: # Check if the label exists
        number_label.set_fontsize(30)
    
# # Add custom labels with boxes outside the Venn diagram
# plt.text(-1.5, 0.5, 'Verified Genes\nfrom analysis', fontsize=12,
#          bbox=dict(facecolor='cyan', edgecolor='black', boxstyle='round,pad=0.5'))
# plt.text(1.0, 0.5, 'Verified Genes\nfrom hORFeome 9.1', fontsize=12,
#          bbox=dict(facecolor='lightblue', edgecolor='black', boxstyle='round,pad=0.5'))

# # Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100

# Add the percentage text inside the plot with a box around it
plt.text(0.5, 0.65, f"Common Verified Genes: \n {intersection_percentage:.1f}%", 
         transform=plt.gca().transAxes,
         horizontalalignment='center', verticalalignment='center', fontsize=30, fontweight='bold',
        #  bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5')
         )

label_A = venn.get_label_by_id('A')
label_B = venn.get_label_by_id('B')

if label_A:
    # Example: Move 'Analyzed Genes' (label A) slightly left and up from its default
    # You'll need to experiment with these (x,y) coordinates
    # label_A.set_position((-0.6, 0.1)) # (x, y) relative to diagram's center (0,0)
    label_A.set_fontsize(30) # You can also change font size here
    label_A.set_fontweight('bold') # Make it bold for emphasis

if label_B:
    # Example: Move 'hORFeome 9.1' (label B) slightly right and up from its default
    # label_B.set_position((0.65, 0.1))
    label_B.set_fontsize(30) # And here
    label_B.set_fontweight('bold') # Make it bold for emphasis

# Add a title
# plt.title("Venn Diagram for Verified Genes", fontsize=14)

# Define the TPM Histogram graph file path for graph storing
Verified_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn_Diagram(Verified_Genes).svg")
plt.savefig(Verified_Genes_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Verified_Genes_path}")

# Show the plot
plt.show()
# plt.close()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn_Diagram(Verified_Genes).svg
No description has been provided for this image

Automative statistical analysis pipeline¶

InĀ [88]:
# Assign handle - only when its necessary
statistical_analysis_handle = ""

# Allocate the below library (mix with other library)
from scipy.stats import chi2, t, norm

# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Getting right database (inclusion of all triplets) - Only getting column header not the data
triplet_database_path = os.path.join(database_files_original, f"{directory}_nor_clean_removed.xlsx")
triplet_database = pd.read_excel(triplet_database_path)

# Make the columns triplets
triplet_to_include = [column for column in triplet_database.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
Triplet_maker = triplet_database[triplet_to_include]
names = Triplet_maker.columns.values

control = ["DMSO"]

Baseline = ["Baseline"]

drug_name_list = []

baseline_dmso_columns = [column for column in names if any(name in column for name in control)
                        or any(name in column for name in Baseline)]

selected_columns = [column for column in names if any(name in column for name in name_list)
                    or any(name in column for name in control)]
# triplets = [selected_columns[i:i + 3] for i in range(0, len(selected_columns), 3)]
triplets = list(OrderedDict((key, [item for item in selected_columns if item.split('-')[1] == key]) for key in [item.split('-')[1] for item in selected_columns]).values())

# To divide column names by DMSO controls
def DMSO_list(lst, word):
    result = []
    current_sublist = []

    for item in lst:
        if word in item[0]:
            if current_sublist:
                result.append(current_sublist)
            current_sublist = []

        current_sublist.append(item)

    if current_sublist:
        result.append(current_sublist)

    return result

divided_triplets = DMSO_list(triplets, "DMSO")

def drug_to_remove(divided_triplets, drug_to_exclude):
    updated_divided_triplets = []

    for sublist in divided_triplets:
        updated_sublist = []
        for inner_list in sublist:
            updated_inner_list = [drug for drug in inner_list if not any(exclude in drug for exclude in drug_to_exclude)]
            if updated_inner_list:
                updated_sublist.append(updated_inner_list)
        if updated_sublist:
            updated_divided_triplets.append(updated_sublist)

    return updated_divided_triplets

drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
updated_divided_triplets = drug_to_remove(divided_triplets, drug_to_exclude)

def separate_batches(batch_list):
    control_samples = []
    experimental_samples = []

    for i, batch in enumerate(batch_list, start=1):
        # Initialize lists for control and experimental samples within the batch
        batch_control = []
        batch_experimental = []

        for sub_batch in batch:
            # Check if the sub-batch contains "DMSO"
            if any("DMSO" in item for item in sub_batch):
                # Append the triplet to control samples
                batch_control.extend(sub_batch)
            else:
                # Append each triplet of drugs to experimental samples
                triplet_list = [sub_batch[j:j + 3] for j in range(0, len(sub_batch), 3)]
                batch_experimental.extend(triplet_list)

        # Append control and experimental samples within the batch
        control_samples.append((f"Batch {i} control_samples =", batch_control))
        experimental_samples.append((f"Batch {i} experimental_samples =", batch_experimental))

    return control_samples, experimental_samples

# Example usage
control_samples, experimental_samples = separate_batches(updated_divided_triplets)

def extract_name(input_list):
    drug_names = set()

    for item in input_list:
        parts = item.split('-')
        if len(parts) == 3 and parts[1].isalnum():
            drug_names.add(parts[1])

    return ' '.join(drug_names)

# Permutation testing initiation
Iter_num = 10000
min_n_sample_for_testing = 2

# Create a copy of batch_compile_stat_df
meanFC_allpval_df = batch_compile_stat_df.copy()
meanFC_allpval_df = meanFC_allpval_df.reset_index()
only_meanFC_allpval_df = batch_compile_stat_df.copy()
only_meanFC_allpval_df = only_meanFC_allpval_df.iloc[:, :4]
only_meanFC_allpval_df = only_meanFC_allpval_df.reset_index()

# Print the results
for i, (control_sample_name, control_sample_data), (experimental_sample_name, experimental_sample_data) in zip(
        range(1, len(control_samples) + 1), control_samples, experimental_samples):
    for exp_sample in experimental_sample_data:
        drug_name = extract_name(exp_sample)
        drug_name_list.append(drug_name)
        if statistical_analysis_handle == "Yes":
    
            print(drug_name)
            nullFC_list = []
            pvalue_list = []
            FC_list = []
            nan_no_list = []

            # Access the DataFrame using the individual column names
            tested_columns = np.array(batch_compile_stat_df[control_sample_data + exp_sample])
            tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
                                    dtype=float)
            [row_num, col_num] = tested_columns.shape
            treat_num = len(exp_sample)
            control_num = len(control_sample_data)

            trnd = np.zeros([row_num, Iter_num])
            wrnd = np.zeros([row_num, Iter_num])
            mrnd = np.zeros([row_num, Iter_num])
            wzrnd = np.zeros([row_num, Iter_num])

            [trnd, wrnd, mrnd, wzrnd] = permutation_test(tested_columns, treat_num, control_num, Iter_num)
            nullFC_list.append(pd.DataFrame(mrnd).stack(level=-1, dropna=False).values)

            x = tested_columns[:, control_num:]  # treatment group
            y = tested_columns[:, :control_num]  # control group

            nonnan_count_x = np.isfinite(x).sum(axis=1)
            nonnan_count_y = np.isfinite(y).sum(axis=1)
            nonnan_bool_x = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_x])
            nonnan_bool_y = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_y])
            nonnan_bool = nonnan_bool_x * nonnan_bool_y
            nan_no_list.append([0 if pd.isnull(i) else 1 for i in nonnan_bool])

            [pt, pw, pm, pwz, ovp1] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 1)
            [pt, pw, pm, pwz, ovp1] = [pt, pw, pm, pwz, ovp1] * nonnan_bool
            [pt, pw, pm, pwz, ovp3] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 3)
            [pt, pw, pm, pwz, ovp3] = [pt, pw, pm, pwz, ovp3] * nonnan_bool

            pvalue_list.append([pt, pw, pm, pwz, ovp1, ovp3])

            FC = np.nanmean(x, axis=1) - np.nanmean(y, axis=1)
            FC = FC * nonnan_bool
            FC_list.append(FC)

            # Extract the drug name
            testing_var = ['nan_filter', 'FC', 'pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']
            df_testing_result = pd.concat([pd.concat([pd.DataFrame(nan_no_list[idx]).T, pd.DataFrame(FC_list[idx]).T, pd.DataFrame(i)], axis=0) for idx,i in enumerate(pvalue_list)], axis = 1).T
            df_testing_result.columns = [f"{var}_{drug_name}" for var in testing_var]
            meanFC_allpval_df = pd.concat([meanFC_allpval_df, df_testing_result], axis=1)
            only_meanFC_allpval_df = pd.concat([only_meanFC_allpval_df, df_testing_result], axis=1)
if statistical_analysis_handle == "Yes":
    meanFC_allpval_df.set_index('index', inplace=True)
    meanFC_allpval_df.index.name = None
    only_meanFC_allpval_df.set_index('index', inplace=True)
    only_meanFC_allpval_df.index.name = None

    # Saving the DataFrame
    meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_all_meanFC_allpval.xlsx")
    meanFC_allpval_df.to_excel(meanFC_allpval_Path, index=False)
    only_meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_only_meanFC_allpval.xlsx")
    only_meanFC_allpval_df.to_excel(only_meanFC_allpval_Path, index=False)
else:
    print("Statiscial analysis handle is not set")

only_meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_only_meanFC_allpval.xlsx")

# After the code, you can reset the warnings to their default behavior
warnings.resetwarnings()
Statiscial analysis handle is not set
InĀ [89]:
only_meanFC_allpval_df = pd.read_excel(only_meanFC_allpval_Path)

# List of p_values to include
p_values = ['pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']

# Columns to select for each drug
columns_to_select = ['ORF_ID', 'NCBI', 'Group', 'Gene_Symbol']

# Iterate over each p-value
for p_value in p_values:
    # Create an empty DataFrame to store the results for this p-value
    result_df = only_meanFC_allpval_df[columns_to_select]
    
    # Iterate over each drug name and create a dataframe for each drug with the selected p-value
    for drug_name in drug_name_list:
        # Select columns that contain drug_name and the specified p-value
        relevant_cols =  [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'{p_value}_{drug_name}']
        
        # Filter the DataFrame to include only the relevant columns
        relevant_df = only_meanFC_allpval_df[relevant_cols].copy()
        
        # Append the relevant data for this drug to the result DataFrame
        result_df = pd.concat([result_df, relevant_df], axis=1)
    file_name = f'{directory}_all_meanFC_{p_value}.xlsx'
    p_value_path = os.path.join(database_files_stats, file_name)
    result_df.to_excel(p_value_path, index=False)

Saving separate P value dataframes¶

Evaluation of mutiple P values¶

InĀ [90]:
# List of p_values to include
p_values = ['pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']

# Initialize a dictionary to store counts for each drug and p-value
counts_per_drug_per_pvalue = {drug_name: {p_value: 0 for p_value in p_values} for drug_name in drug_name_list}

# Iterate over each p-value
for p_value in p_values:
    # Iterate over each drug name
    for drug_name in drug_name_list:
        # Select relevant columns for the specified drug and p-value
        relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'{p_value}_{drug_name}']
        
        # Filter the DataFrame based on specified conditions
        filtered_df = only_meanFC_allpval_df[
                            (only_meanFC_allpval_df[relevant_cols[0]] == 1) &
                            ((only_meanFC_allpval_df[relevant_cols[1]] > 0.5) | (only_meanFC_allpval_df[relevant_cols[1]] < -0.5)) &
                            (only_meanFC_allpval_df[f'{p_value}_{drug_name}'] < 0.05)
                                            ]
        
        # Count the number of genes in the filtered DataFrame
        gene_count = filtered_df.shape[0]

        # Update the counts for this drug and p-value
        counts_per_drug_per_pvalue[drug_name][p_value] = gene_count

# Create subplots for each p-value
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 16))
axes = axes.flatten()

for i, p_value in enumerate(p_values):
    drug_names = list(counts_per_drug_per_pvalue.keys())
    counts = [counts_per_drug_per_pvalue[drug_name][p_value] for drug_name in drug_names]

    # Get the colors for the bars based on the drug_color_map
    colors = [drug_color_map[drug_name] for drug_name in drug_names]

    ax = axes[i]
    bars = ax.bar(drug_names, counts, color=colors)

    # Set the background to white and remove the edges
    ax.set_facecolor('white')
    for bar in bars:
        bar.set_edgecolor('none')

    ax.set_ylabel('Number of Genes',  fontsize=12)
    ax.set_title(f'Filtered genes for {p_value}',  fontsize=20)
    ax.tick_params(axis='x', rotation=90, labelsize=11)
    ax.tick_params(axis='y', labelsize=12)

# Remove empty subplots if any
for i in range(len(p_values), len(axes)):
    fig.delaxes(axes[i])

# Define the file path for storing graph
stat_test_path = os.path.join(graphs_files_stats, f"{directory}_pval_filter.svg")
plt.savefig(stat_test_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_pval_filter.svg to {stat_test_path}")

plt.tight_layout()
plt.show()
RQ023682_pval_filter.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_pval_filter.svg
No description has been provided for this image
InĀ [91]:
# Make a copy of the original dictionary
drug_categories = drug_category.copy()
color_categories = category_colors.copy()

# Check if 'Control' key exists in the dictionary and remove it if present
if 'Control' in drug_categories:
    del drug_categories['Control']

if 'Control' in color_categories:
    del color_categories['Control']

Cumulative Number of Filtered Genes as Number of Drugs increases¶

InĀ [92]:
# Path to the DataFrame file
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")

# Read the DataFrame
Final_pval = pd.read_excel(Final_pval_path)

# Dictionary to store filtered DataFrames for each drug
filter_dfs = {}

# Iterate over each drug and filter the DataFrame
for drug_name in drug_name_list:
    relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
    
    # Filter the DataFrame based on specified conditions for the current drug
    filtered_df = Final_pval[
        (Final_pval[relevant_cols[0]] == 1) &
        ((Final_pval[relevant_cols[1]] > 0.5) | (Final_pval[relevant_cols[1]] < -0.5)) &
        (Final_pval[f'ovp3_{drug_name}'] < 0.05)
    ]
    
    # Store the unique genes (rows) for the current drug
    filter_dfs[drug_name] = set(filtered_df['NCBI'])

# Sort the drugs by the size of their filtered sets
sorted_drugs = sorted(filter_dfs, key=lambda x: len(filter_dfs[x]),reverse=True)

# Calculate the cumulative count of genes for each cumulative drug count
cumulative_gene_counts = []

# Start with the smallest set of genes
smallest_genes = filter_dfs[sorted_drugs[0]]
cumulative_gene_counts.append(len(smallest_genes))

# Iterate over the drugs, starting from the second smallest
for drug_name in sorted_drugs[1:]:
    # Add genes from the next drug while avoiding duplicates
    smallest_genes.update(filter_dfs[drug_name])
    cumulative_gene_counts.append(len(smallest_genes))

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(drug_name_list) + 1), cumulative_gene_counts, marker='o', color='black')
plt.xlabel('Cumulative Drug Count (Sorted by Gene Count)')
plt.ylabel('Cumulative Gene Count')
plt.title('Number of Cumulative Genes')
plt.xticks(range(1, len(drug_name_list) + 1))
plt.grid(True)
# Format y-axis labels with a comma for thousands separator
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()
No description has been provided for this image
InĀ [93]:
from scipy import stats

# Function to perform permutation testing by swapping genes within samples (treatments)
def permuted_cumulative_gene_count(sorted_drugs, filter_dfs):
    genes = set()
    cumulative_gene_counts = []

    for drug_name in sorted_drugs:
        # Get the set of genes for the current drug
        drug_genes = list(filter_dfs[drug_name])

        # Shuffle the order of genes within the drug (sample)
        permuted_genes = random.sample(drug_genes, len(drug_genes))
        genes.update(permuted_genes)
        cumulative_gene_counts.append(len(genes))

    return cumulative_gene_counts

num_permutations = 1000  # You can adjust the number of permutations as needed
permuted_counts = []

for _ in range(num_permutations):
    random.shuffle(sorted_drugs)  # Randomly shuffle the drug order
    permuted_counts.append(permuted_cumulative_gene_count(sorted_drugs, filter_dfs))

# Calculate the degrees of freedom for the t-distribution
degrees_of_freedom = num_permutations - 1
confidence_level = 0.9999
t_critical = abs(stats.t.ppf((1 - confidence_level) / 2, degrees_of_freedom))
mean_counts = np.mean(permuted_counts, axis=0)
std_dev = np.std(permuted_counts, axis=0)
standard_error = std_dev / np.sqrt(num_permutations)
upper_bound = mean_counts + t_critical * standard_error  # 95% confidence interval
lower_bound = mean_counts - t_critical * standard_error

plt.figure(figsize=(16,8))
ax = plt.gca()
plt.plot(range(1, len(drug_name_list) + 1), mean_counts, marker='o', color='black', label='Set of Genes', linewidth=3, markersize=10)
plt.fill_between(range(1, len(drug_name_list) + 1), lower_bound, upper_bound, color='#F6CFFC', alpha=1, label='95% CI')
plt.xlabel('Cumulative Number of chemotherapeutics')
plt.ylabel('Cumulative Number of Genes')
plt.title(' ')
plt.xticks(range(1, len(drug_name_list) + 1))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.grid(False)
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.legend(frameon=False)

# Define the TPM Histogram graph file path for graph storing
Permuted_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Permuted_Genes.svg")
plt.savefig(Permuted_Genes_path , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Permuted_Genes_path}")
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Permuted_Genes.svg
No description has been provided for this image

Numbers of Drugs pass the filters in Genes¶

InĀ [94]:
# Path to the DataFrame file
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")

# Read the DataFrame
Final_pval = pd.read_excel(Final_pval_path)

# Create a DataFrame to store the filter results (True/False for each data point)
filter_results = pd.DataFrame(index=Final_pval.index)

# Iterate over each drug and check if each data point passes the filter
for drug_name in drug_name_list:
    relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
    
    # Apply the filter conditions and store the results in the filter_results DataFrame
    filter_results[drug_name] = (
        (Final_pval[relevant_cols[0]] == 1) &
        (
        (Final_pval[relevant_cols[1]] >= 0.5) 
        | 
        (Final_pval[relevant_cols[1]] <= -0.5)
        ) 
        &
        (Final_pval[f'ovp3_{drug_name}'] <= 0.05)
    )

# Count how many 'True' values there are per row in the filter_results DataFrame
num_drugs_passed_filter = filter_results.sum(axis=1)

# Count how many genes have a specific number of drugs passing the filter (0 to 16)
count_per_num_drugs = num_drugs_passed_filter.value_counts().sort_index()

# Create a list to store the counts of "True" values in each row
true_counts = filter_results.apply(lambda row: row.sum(), axis=1)

# Find the rows where the count of "True" values is greater than or equal to 7
selected_rows = filter_results[true_counts >= 7]

# Create an empty DataFrame to store the count of "True" values for each category
category_counts = pd.DataFrame(index=selected_rows.index)

for category, drugs in drug_categories.items():
    # Count the number of "True" values for each drug category and assign the category name as the column header
    category_counts[category] = selected_rows[drugs].sum(axis=1)

# Create a condition for at least three categories with at least two "True" values
final_condition = (category_counts >= 1).sum(axis=1) >= 3

# Apply the final condition to select rows
reselected_rows = selected_rows[final_condition]
core_indices = reselected_rows.index

# Finding Core Gene NCBI 
Core_Genes = Final_pval.loc[core_indices, 'NCBI']
Core_Genes = sorted(list(Core_Genes))

# Get unique sorted values from num_drugs_passed_filter
sorted_keys = sorted(num_drugs_passed_filter.unique())

# Initialize an empty dictionary to store the results
Number_drugs_dict = {}

# Iterate through the sorted keys
for key in sorted_keys:
    # Get the corresponding 'NCBI' values for this key
    ncbi_values = Final_pval.loc[num_drugs_passed_filter == key, 'NCBI'].tolist()

    # Sort the 'NCBI' values within the key
    ncbi_values.sort()

    # Add the key-value pair to the results dictionary
    Number_drugs_dict[key] = ncbi_values

# Additional column indicating what drugs that has passed filter in particular genes 
num_drugs_passed_filter_with_drugs = num_drugs_passed_filter.index.to_series().apply(
    lambda index: '|'.join(filter_results.columns[filter_results.loc[index]]))
num_drugs_passed_filter_with_drugs.index = num_drugs_passed_filter.index

# cmap = plt.cm.cool
cmap = plt.cm.copper
num_bars = len(count_per_num_drugs)
colors = [cmap(i) for i in np.linspace(0, 1, num_bars)]

# Create a histogram plot
plt.figure(figsize=(16,8))
ax = plt.gca()
plt.bar(count_per_num_drugs.index, count_per_num_drugs.values, align='center', alpha=0.7, color=colors)
plt.yscale('log')  # Set y-axis to log scale
plt.xlabel('Number of Chemothrapeuics with DEGs')
plt.ylabel('Number of Genes')
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# plt.title('Histogram of Genes by Number of Drugs Passing Filter')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks(range(len(drug_name_list)+1))

# Define the TPM Histogram graph file path for graph storing
Filtered_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Filtered_Genes.svg")
plt.savefig(Filtered_Genes_path  , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Filtered_Genes_path }")
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Filtered_Genes.svg
No description has been provided for this image
InĀ [95]:
# Additional column indicating what drugs passed filter with direction (↑/↓)
def get_passed_drugs_with_arrows(index):
    passed_drugs = []
    for drug in filter_results.columns:
        if filter_results.loc[index, drug]:
            fc = Final_pval.loc[index, f'FC_{drug}']
            arrow = '↑' if fc >= 0.5 else '↓'
            passed_drugs.append(f"{drug}{arrow}")
    return '|'.join(passed_drugs)

num_drugs_passed_filter_with_drugs = num_drugs_passed_filter.index.to_series().apply(get_passed_drugs_with_arrows)
num_drugs_passed_filter_with_drugs.index = num_drugs_passed_filter.index
InĀ [96]:
# Define the observed data
observed_data = np.arange(len(count_per_num_drugs))  # The x-values (0 to 13)
counts = np.array(count_per_num_drugs)  # The corresponding counts

# Define the probability density function for the mixture distribution
def mixture_pdf(x, params):
    weight_gamma, alpha, beta, mu, sigma = params
    gamma_pdf = weight_gamma * gamma.pdf(x, alpha, scale=1/beta)
    normal_pdf = (1 - weight_gamma) * norm.pdf(x, loc=mu, scale=sigma)
    return gamma_pdf + normal_pdf

# Define the negative log-likelihood function to be minimized
def negative_log_likelihood(params):
    return -np.sum(counts * np.log(mixture_pdf(observed_data, params)))

# Initial guess for parameters
initial_params = [0.5, 1, 1, 6, 3]  # Adjust initial parameters as needed

# Define bounds for the parameters
bounds = [(0, 1), (0, None), (0, None), (0, None), (0, None)]

# Minimize the negative log-likelihood to estimate parameters
result = minimize(negative_log_likelihood, initial_params, bounds=bounds)

# Get the estimated parameters
estimated_params = result.x

# Generate x values for the plot
x = np.arange(0, 14, 0.1)

# Plot the observed data and the fitted mixture distribution
plt.figure(figsize=(8, 6))
plt.bar(observed_data, counts, color='black', alpha=0.7, label='Observed Data')
plt.plot(x, counts.sum() * mixture_pdf(x, estimated_params), 'r-', label='Mixture Distribution', linewidth=2)
# plt.yscale('log')
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Gene Count')
plt.title('Mixture Distribution Fit to Observed Data')
plt.legend()
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:596: RuntimeWarning:

invalid value encountered in subtract

No description has been provided for this image
InĀ [97]:
# Define the probability density function for the mixture distribution
def mixture_pdf(x, params):
    weight_gamma, alpha, beta, mu, sigma = params
    gamma_pdf = weight_gamma * gamma.pdf(x, alpha, scale=1/beta)
    normal_pdf = (1 - weight_gamma) * norm.pdf(x, loc=mu, scale=sigma)
    return gamma_pdf, normal_pdf

# Define the negative log-likelihood function to be minimized
def negative_log_likelihood(params):
    gamma_pdf, normal_pdf = mixture_pdf(observed_data, params)
    return -np.sum(counts * np.log(gamma_pdf + normal_pdf))

# Initial guess for parameters
initial_params = [0.5, 1, 1, 6, 3]  # Adjust initial parameters as needed

# Define bounds for the parameters
bounds = [(0, 1), (0, None), (0, None), (0, None), (0, None)]

# Minimize the negative log-likelihood to estimate parameters
result = minimize(negative_log_likelihood, initial_params, bounds=bounds)

# Get the estimated parameters
estimated_params = result.x

# Calculate the cutoff point by finding the intersection
cutoff = brentq(lambda x: mixture_pdf(x, estimated_params)[0] - mixture_pdf(x, estimated_params)[1], 0, 14)

# Generate x values for the plot
x = np.arange(0, 14, 0.1)

# Create a subplot with 2 rows and 1 column
plt.figure(figsize=(30, 12))

# Plot the Gamma component in the first subplot
plt.subplot(1, 2, 1)
gamma_pdf, normal_pdf = mixture_pdf(x, estimated_params)
plt.plot(x, counts.sum() * gamma_pdf, 'r-', label='Gamma Distribution', linewidth=2)
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('Gamma Distribution')
plt.legend()

# Plot the Normal component in the second subplot
plt.subplot(1, 2, 2)
plt.plot(x, counts.sum() * normal_pdf, 'b-', label='Normal Distribution', linewidth=2)
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('Normal Distribution')
plt.legend()

# Create a subplot with 2 rows and 1 column
plt.figure(figsize=(16, 16))
ax = plt.gca()

# Plot the Gamma component
plt.plot(x, counts.sum() * gamma_pdf, 'r-', label='Gamma Distribution', linewidth=10)
# Plot the Normal component
plt.plot(x, counts.sum() * normal_pdf, 'b-', label='Normal Distribution', linewidth=10)

# Plot the cutoff point
plt.axvline(cutoff, color='black', linestyle='--', label='Cutoff Point', linewidth=5)

plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('')
plt.legend( loc='upper right')
plt.yticks(np.arange(0, 8001, 2000))

# # Format the cutoff value to 3 significant figures
# cutoff_text = f"Cutoff: {cutoff:.3g}"
# # Get the current y-axis limits to place the text appropriately
# ymin, ymax = plt.ylim()
# # Choose a y-position for the text (e.g., 90% of the way up the y-axis)
# text_y_position = ymin + (ymax - ymin) * 0.2
# Add the text slightly to the right of the line
# plt.text(cutoff + 0.2, text_y_position, cutoff_text,
#          color='black',
#          fontsize=plt.rcParams['font.size'] * 1, # Slightly smaller than global font size
#          horizontalalignment='left', # Align text to its left
#          verticalalignment='center',
#          fontweight='bold') # Vertically center the text

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Adjust the spacing between subplots
plt.tight_layout()

# Define the TPM Histogram graph file path for graph storing
Multi_Norm_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Multi_norms.svg")
plt.savefig(Multi_Norm_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Multi_Norm_path}")

# Show the plot
plt.show()
# plt.close()

print("Cutoff Point:", cutoff)
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Multi_norms.svg
No description has been provided for this image
No description has been provided for this image
Cutoff Point: 2.6434598205529287

Categorical Genes¶

InĀ [98]:
# Assign handle - only when its necessary
categorical_analysis_handle = ""

# Allocate the below library (mix with other library)
from scipy.stats import chi2, t, norm

# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Permutation testing initiation
Iter_num = 10000
min_n_sample_for_testing = 2

# Create a dataframe for categorical analysis
category_df = Final_pval.copy()
# List to store the ordered column names
ordered_columns = []
desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]
# Iterate through the desired order of drug categories
for category in desired_order:
    drugs_in_category = drug_category.get(category, [])
    for drug in drugs_in_category:
        # Add the FC and pval columns for each drug
        ordered_columns.extend([f"FC_{drug}", f"ovp3_{drug}"])
# Reorder the columns in the DataFrame and reset the index
category_df = category_df[ordered_columns]
fc_columns = [col for col in category_df.columns if "FC" in col]
category_df = category_df[fc_columns]

# Create a dataframe for categorical analysis 
categorical_df = batch_compile_stat_df.iloc[:, :4]
categorical_df = categorical_df.reset_index()

# Create a dictionary to hold control samples for each category
category_control_samples = {category: [] for category in drug_categories}

for category in drug_categories.keys():
    experimental_samples = []
    control_samples = []

    for name in ordered_columns:
        drug_name = name.split('_')[1]

        if drug_name in drug_categories[category]:
            experimental_samples.append(name)
        elif category not in [cat for cat, drugs in drug_categories.items() if drug_name in drugs]:
            control_samples.append(name)
    if categorical_analysis_handle == "Yes":
        print(category)
        nullFC_list = []
        pvalue_list = []
        FC_list = []
        nan_no_list = []

        # Access the DataFrame using the individual column names
        tested_columns = np.array(category_df[control_samples + experimental_samples])
        tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
                                dtype=float)
        [row_num, col_num] = tested_columns.shape
        treat_num = len(experimental_samples)
        control_num = len(control_samples)

        trnd = np.zeros([row_num, Iter_num])
        wrnd = np.zeros([row_num, Iter_num])
        mrnd = np.zeros([row_num, Iter_num])
        wzrnd = np.zeros([row_num, Iter_num])

        [trnd, wrnd, mrnd, wzrnd] = permutation_test(tested_columns, treat_num, control_num, Iter_num)
        nullFC_list.append(pd.DataFrame(mrnd).stack(level=-1, dropna=False).values)

        x = tested_columns[:, control_num:]  # treatment group
        y = tested_columns[:, :control_num]  # control group

        nonnan_count_x = np.isfinite(x).sum(axis=1)
        nonnan_count_y = np.isfinite(y).sum(axis=1)
        nonnan_bool_x = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_x])
        nonnan_bool_y = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_y])
        nonnan_bool = nonnan_bool_x * nonnan_bool_y
        nan_no_list.append([0 if pd.isnull(i) else 1 for i in nonnan_bool])

        [pt, pw, pm, pwz, ovp3] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 3)
        [pt, pw, pm, pwz, ovp3] = [pt, pw, pm, pwz, ovp3] * nonnan_bool

        pvalue_list.append([ovp3])

        FC = np.nanmedian(x, axis=1) - np.nanmedian(y, axis=1)
        FC = FC * nonnan_bool
        FC_list.append(FC)

        # Extract the drug name
        testing_var = ['nan_filter', 'FC', 'ovp3']
        df_testing_result = pd.concat([pd.concat([pd.DataFrame(nan_no_list[idx]).T, pd.DataFrame(FC_list[idx]).T, pd.DataFrame(i)], axis=0) for idx,i in enumerate(pvalue_list)], axis = 1).T
        # df_testing_result.columns = [f"{var}_{drug}" for var in testing_var]
        df_testing_result.columns = [f"{var}_{category}" for var in testing_var]
        categorical_df = pd.concat([categorical_df, df_testing_result], axis=1)

if categorical_analysis_handle == "Yes":
    categorical_df.set_index('index', inplace=True)
    categorical_df.index.name = None

    # Saving the DataFrame
    categorical_Path = os.path.join(database_files_stats, f"{directory}_categorical_analysis.xlsx")
    categorical_df.to_excel(categorical_Path , index=False)
else:
    print("Categorical analysis handle is not set")

# After the code, you can reset the warnings to their default behavior
warnings.resetwarnings()
Categorical analysis handle is not set
InĀ [99]:
# Recalling Categorical & Peripheral dataframes
categorical_Path = os.path.join(database_files_stats, f"{directory}_categorical_analysis.xlsx")
categorical_df = pd.read_excel(categorical_Path)

## Modifying the dataframes
# Remove columns containing 'nan_filter'
categorical_mod_df = categorical_df.drop(columns=[col for col in categorical_df.columns if 'nan_filter' in col])

# Fill NaN values in 'FC' columns with 0
categorical_fc_columns = [col for col in categorical_mod_df.columns if 'FC' in col]
categorical_mod_df[categorical_fc_columns] = categorical_mod_df[categorical_fc_columns].fillna(0)

# Replace NaN values in 'ovp3' columns with 1
categorical_ovp3_columns = [col for col in categorical_mod_df.columns if 'ovp3' in col]
categorical_mod_df[categorical_ovp3_columns] = categorical_mod_df[categorical_ovp3_columns].fillna(1)

# Replace 'ovp3' with 'pval' in column names
categorical_mod_df.columns = [col.replace('ovp3', 'pval') for col in categorical_mod_df.columns]
InĀ [100]:
# Dictionary to store filtered DataFrames for each drug
category_keys = list(drug_categories.keys())

# Create a figure and axes
fig, ax = plt.subplots(figsize=(15, 15))

# Iterate over each drug and filter the DataFrame
for category_name in category_keys:
    fc_column = f'FC_{category_name}'
    data = categorical_mod_df[fc_column]
    ax.hist(data, bins=20, alpha=0.5, label=f'FC_{category_name}')

# Set plot titles and labels
ax.set_title('Histogram of FC Values for Categories')
ax.set_xlabel('FC Values')
ax.set_ylabel('Frequency')

# Add a legend
ax.legend()

# Display the plot
plt.show()
No description has been provided for this image

Number of Unique Genes per Categories¶

InĀ [101]:
# Dictionary to store filtered DataFrames for each drug
category_keys = list(drug_categories.keys())

category_dfs = {}
category_dfs_no = {}
category_dfs_select = {}
category_dfs_select_no = {}

# Iterate over each drug and filter the DataFrame
for category_name in category_keys:
    relevant_cols = [f'FC_{category_name}', f'pval_{category_name}']

    # Filter the DataFrame based on specified conditions for the current drug
    category_df = categorical_mod_df[
        (
            (categorical_mod_df[relevant_cols[0]] >= 1.0)
            |
            (categorical_mod_df[relevant_cols[0]] <= -1.0)
        )
        &
        (categorical_mod_df[relevant_cols[1]] <= 0.01)
    ]

    # Store the unique genes (rows) for the current drug
    category_dfs[category_name] = set(category_df['NCBI'])
    category_dfs_no[category_name] = len(category_df['NCBI'])

    # Store the unique genes (rows) for the current drug in a set
    selected_genes = set(category_df['NCBI'])

    # Check for other drugs and remove genes with p-value < 0.05 for other drugs
    for other_category in category_keys:
        if other_category != category_name:
            other_cols = [f'FC_{other_category}', f'pval_{other_category}']
            genes_to_remove = categorical_mod_df[
                (
                (categorical_mod_df[other_cols[0]] >= 1.0)
                |
                (categorical_mod_df[other_cols[0]] <= -1.0)
                )
                &   
                (
                    (categorical_mod_df[other_cols[1]] <= 0.01)
                )
            ]['NCBI']
            selected_genes.difference_update(set(genes_to_remove))

    # Store the selected genes in the dictionary
    category_dfs_select[category_name] = set(selected_genes) - set(Core_Genes)
    category_dfs_select_no[category_name] = len(category_dfs_select[category_name])

# Create a bar plot for the number of unique genes in each category with specified colors
plt.figure(figsize=(16, 8))
ax = plt.gca()
for i, (category, color) in enumerate(color_categories.items()):
    plt.bar(i, category_dfs_select_no.get(category, 0), color=color, label=category)

# plt.xlabel('Chemotherapeutics Categories')
plt.xlabel('')
plt.ylabel('Number of Unique Genes')
plt.title('')
# plt.xticks(range(len(color_categories)), list(drug_categories.keys()), rotation=45)
plt.xticks([])
# legend = plt.legend(loc='upper left', bbox_to_anchor=(1, 5), title='Categories')
# plt.legend()
plt.legend(loc='upper left', bbox_to_anchor=(1, 1.027), fontsize=25, title= 'Chemotherapeutics Categories', title_fontsize=30, frameon=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# Define the TPM Histogram graph file path for graph storing
Categorical_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Categorical_Graph.svg")
plt.savefig(Categorical_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Categorical_graph_path}")
plt.tight_layout()
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Categorical_Graph.svg
No description has been provided for this image
InĀ [102]:
# Dictionary to store filtered DataFrames for each drug
peripheral_dfs = {}
peripheral_dfs_no = {}
peripheral_dfs_select = {}
peripheral_dfs_select_no = {}

# Iterate over each drug and check if each data point passes the filter
for drug_name in drug_name_list:
    relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
    
    # Apply the filter conditions and store the results in the filter_results DataFrame
    peripheral_df = Final_pval[
        (Final_pval[relevant_cols[0]] == 1) &
        (
        (Final_pval[relevant_cols[1]] >= 0.5) 
        | 
        (Final_pval[relevant_cols[1]] <= -0.5)
        ) 
        &
        (Final_pval[f'ovp3_{drug_name}'] <= 0.05)
    ]
    peripheral_dfs[drug_name] = set(peripheral_df['NCBI'])
    peripheral_dfs_no[drug_name] = len(peripheral_df['NCBI'])

    # Store the unique genes (rows) for the current drug in a set
    selected_genes = set(peripheral_df['NCBI'])

    # Check for other drugs and remove genes with p-value < 0.05 for other drugs
    for other_category in drug_name_list:
        if other_category != drug_name:
            other_cols = [f'nan_filter_{other_category}', f'FC_{other_category}', f'ovp3_{other_category}']
            genes_to_remove = Final_pval[
                        (Final_pval[other_cols[0]] == 1) &
                        (
                        (Final_pval[other_cols[1]] >= 0.5) 
                        | 
                        (Final_pval[other_cols[1]] <= -0.5)
                        ) 
                &
                (
                    (Final_pval[other_cols[2]] <= 0.05)
                )
            ]['NCBI']
            selected_genes.difference_update(set(genes_to_remove))
    
    # Store the selected genes in the dictionary
    peripheral_dfs_select[drug_name] = set(selected_genes - set(Core_Genes))
    peripheral_dfs_select_no[drug_name] = len(peripheral_dfs_select[drug_name])

# Create an empty set to store all genes from category_dfs_select   
all_category_genes = set()

# Iterate over each category and add its genes to the set
for genes in category_dfs_select.values():
    all_category_genes.update(genes)
    
    for drug_name in drug_name_list:
        # Remove overlapping genes between peripheral_dfs_select and the current category_genes
        peripheral_dfs_select[drug_name] = list(set(peripheral_dfs_select[drug_name]) - all_category_genes)
        
        # Update the length after removing overlapping genes
        peripheral_dfs_select_no[drug_name] = len(peripheral_dfs_select[drug_name])

desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]
# Create a list of tuples with drug names and their corresponding values
drugs_with_values = [(drug, peripheral_dfs_select_no[drug]) for category in desired_order for drug in drug_categories.get(category, [])]
# Sort the list of tuples based on the order in 'drugs_with_values'
sorted_drugs_with_values = sorted(drugs_with_values, key=lambda item: drugs_with_values.index(item))
# Create a new dictionary with the sorted drug-value pairs
sorted_peripheral_dfs_select_no = dict(sorted_drugs_with_values)

# Flatten the list of drugs from the categories
all_drugs = [drug for category in desired_order for drug in drug_categories.get(category, [])]
# Now, you can sort the drugs based on the order in the 'all_drugs' list
sorted_drugs = sorted(all_drugs, key=lambda drug: all_drugs.index(drug))

# Create a bar plot for the number of unique genes in each category with specified colors
plt.figure(figsize=(16, 11.5))
ax = plt.gca()
# Iterate over the drugs in drug_name_list
for i, drug in enumerate(sorted_drugs):
    if drug in drug_color_map:
        category = drug
        color = drug_color_map[drug]
        plt.bar(i, sorted_peripheral_dfs_select_no.get(category, 0), color=color, label=category)

plt.xlabel('')
plt.ylabel('Number of Unique Genes', fontsize=40)
# plt.title('Unique Genes per chemotherapies')
plt.title('')
# plt.xticks(range(len(sorted_drugs)), sorted_drugs, ha='right', rotation=45, fontsize=25)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks([])
plt.tight_layout()
plt.legend(loc='upper left', bbox_to_anchor=(1, 1.05), title = 'Chemotherapeutics', fontsize=30, title_fontsize=30, frameon=False)

# Define the TPM Histogram graph file path for graph storing
Peripheral_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Peripheral_Graph.svg")
plt.savefig(Peripheral_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Peripheral_graph_path}")
plt.tight_layout()

plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Peripheral_Graph.svg
No description has been provided for this image
InĀ [103]:
# Core Genes to set
Core_Genes = set(Core_Genes)

# Create a set to store the values
category_all_values = set()

# Create a set to store the overlapping values
overlapping_values = set()

for values in category_dfs_select.values():
    for value in values:
        if value in category_all_values:
            overlapping_values.add(value)
        else:
            category_all_values.add(value)

category_all_values = set(sorted(category_all_values))

# Check if there are overlapping values
if overlapping_values:
    print("There are overlapping values between Category.")
    print("Overlapping values:", overlapping_values)
else:
    print("There are no overlapping values between keys.")

# Create a set to store the values
peripheral_all_values = set()

# Create a set to store the overlapping values
overlapping_values = set()

for values in peripheral_dfs_select.values():
    for value in values:
        if value in peripheral_all_values:
            overlapping_values.add(value)
        else:
            peripheral_all_values.add(value)

peripheral_all_values = set(sorted(peripheral_all_values))

# Check if there are overlapping values
if overlapping_values:
    print("There are overlapping values between Peripheral.")
    print("Overlapping values:", overlapping_values)
else:
    print("There are no overlapping values between keys.")
There are no overlapping values between keys.
There are no overlapping values between keys.
InĀ [104]:
# Initialize dictionaries to store overlapping and non-overlapping genes
filtered_rest_genes_dict = {}
overlapping_genes_dict = {}

# Iterate through the keys and values in Number_drugs_dict
for key, values in Number_drugs_dict.items():
    # Find the intersection of the values with all three sets
    overlapping_values = set(values) & (set(Core_Genes) | set(category_all_values) | set(peripheral_all_values))
    
    # Find non-overlapping values
    non_overlapping_values = set(values) - overlapping_values
    
    # Check if there are overlapping values and non-overlapping values
    if overlapping_values:
        overlapping_genes_dict[key] = list(overlapping_values)
    
    if non_overlapping_values:
        filtered_rest_genes_dict[key] = list(non_overlapping_values)
InĀ [105]:
items_per_key = {key: len(value) for key, value in filtered_rest_genes_dict.items()}
# Print the items per key
print(items_per_key)

multidrug_genes = set()
non_respondent_genes = set()

for key, values in filtered_rest_genes_dict.items():
    if key >= 2:
        multidrug_genes.update(values)
    if key <= 1:
        non_respondent_genes.update(values)
{np.int64(0): 2849, np.int64(2): 3117, np.int64(3): 1877, np.int64(4): 953, np.int64(5): 414, np.int64(6): 194, np.int64(7): 1}
InĀ [106]:
from venn import venn
genes = {
    "Core" : Core_Genes,
    "Category": category_all_values,
    "Peripheral": peripheral_all_values,
    "Multidrug": multidrug_genes,
    "Non-Respondent": non_respondent_genes 
}

venn(genes, fontsize=50, legend_loc="upper right", cmap="viridis", figsize=(25,25))
fig = plt.gcf()

# --- REMOVE BOX FROM LEGEND & SET LEGEND TITLE ---
# Get the current axes object from the figure
ax = fig.gca()
# Get the legend object attached to these axes
legend = ax.get_legend()

if legend: # Check if a legend was actually created by the 'venn' function
    legend.set_frame_on(False) # Remove the box (frame) around the legend
    legend.set_title("Categorization") # Set the desired legend title
    legend.get_title().set_fontsize(50) # Example font size for title
    legend.get_title().set_fontweight('bold') # Make the title bold
    for text_obj in legend.get_texts()[0:6]:
        text_obj.set_fontsize(40)
    
# for patch in ax.patches:
#     patch.set_alpha(0.7)

# set_names = list(genes.keys())
# single_set_keys = []
# for i in range(len(set_names)):
#     # Create a bitmask for only the i-th set
#     bitmask = '0' * i + '1' + '0' * (len(set_names) - 1 - i)
#     single_set_keys.append(bitmask)

# print(f"Single set keys to look for: {single_set_keys}")

# Display the diagram

# Define the TPM Histogram graph file path for graph storing
Venn5_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn5_Graph.svg")
plt.savefig(Venn5_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()

plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
No description has been provided for this image

Resetting Log2 Fold Change & P Value dataframe¶

InĀ [107]:
# Calling right dataframe for FC and pval
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")
Final_pval = pd.read_excel(Final_pval_path)

# Preparation for other indexing than FC or pval data
Final_pval_index = Final_pval.iloc[:, :4]
verified_columns = hORFeome9_1[['entrez_gene_id', 'Verified']]

# Merge the DataFrames on 'NCBI'
verified_index = pd.merge(Final_pval_index, verified_columns, left_on='NCBI', right_on='entrez_gene_id', how='left')
verified_index = verified_index.drop("entrez_gene_id", axis=1)
verified_index['Verified'] = verified_index['Verified'].fillna(0).astype(int)
verified_index = verified_index.set_index(batch_compile_stat_df.index)
verified_index = verified_index.reset_index()

# Get data for calculating silencing effect
silencing = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]

# Filter the DataFrame based on silencing terms in column names
silencing_df = batch_compile_stat_df.loc[:, batch_compile_stat_df.columns.str.contains('|'.join(silencing))]
silencing_compile_df = silencing_df.copy()

# Subtract the values and store the result in a new column using .loc
silencing_compile_df.loc[:, 'Silencing Ratio(log)'] = silencing_df['63-mCherryPositive&BFPNegative'] - silencing_df['64-mCherryNegative&BFPNegative']

# Define a function to determine silencing status based on the ratio
def get_silencing_status(ratio):
    if np.isnan(ratio):
        return 'Not Found'
    elif ratio >= -3:
        return 'No Silence'
    else:
        return 'Silenced'

# Apply the function to create the "Silencing" column
silencing_compile_df['Silencing'] = silencing_compile_df['Silencing Ratio(log)'].apply(get_silencing_status)

silencing_compile_df = silencing_compile_df.reset_index()

# Merge the "Silencing Ratio(log)" column from silencing_compile_df to verified_index
indexing_df = pd.merge(verified_index, silencing_compile_df[['index', 'Silencing Ratio(log)', 'Silencing']], on='index')

indexing_df['Drug Number'] = num_drugs_passed_filter.tolist()
indexing_df['Drug Annotation'] = num_drugs_passed_filter_with_drugs.tolist()

# Function to determine the 'Type' based on the conditions
def determine_type(row):
    ncbi_number = row['NCBI']
    
    if ncbi_number in Core_Genes:
        return 'Core'
    for category, values in category_dfs_select.items():
        if ncbi_number in values:
            return category
    for drug, values in peripheral_dfs_select.items():
        if ncbi_number in values:
            return drug
    if ncbi_number in multidrug_genes:
        return 'Multidrug'
    if ncbi_number in non_respondent_genes:
        return 'Non-respondent'
    return None  # If not found in any of the sets

# Apply the function to create the 'Type' column
indexing_df['Type'] = indexing_df.apply(determine_type, axis=1)

# Add GO Count and Pubmed Count
GO_Count = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/Addition/01_gene_numgo_exp.xlsx")
Pudmed_Count = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/Addition//01_gene_numpub.xlsx")
# GO_Count.rename(columns={'Entrez ID': 'NCBI', 'Symbol': 'Gene_Symbol', '# exp GO': 'GO Count'}, inplace=True)
# Pudmed_Count.rename(columns={'Entrez ID': 'NCBI', 'Symbol': 'Gene_Symbol', '# Pubmed': 'Pubmed Count'}, inplace=True)
indexing_df = indexing_df.merge(GO_Count[['Entrez ID', '# exp GO']], left_on='NCBI', right_on='Entrez ID', how='left')
indexing_df.rename(columns={'# exp GO': 'GO Count'}, inplace=True)
indexing_df.drop('Entrez ID', axis=1, inplace=True)
indexing_df = indexing_df.merge(Pudmed_Count[['Entrez ID', '# Pubmed']], left_on='NCBI', right_on='Entrez ID', how='left')
indexing_df.rename(columns={'# Pubmed': 'Pubmed Count'}, inplace=True)
indexing_df.drop('Entrez ID', axis=1, inplace=True)

# Prepare FC and pval data
Final_pval_data = Final_pval.iloc[:, 4:]
Final_pval_data = Final_pval_data.set_index(batch_compile_stat_df.index)

# Remove columns containing 'nan_filter'
Final_pval_data = Final_pval_data.drop(columns=[col for col in Final_pval_data.columns if 'nan_filter' in col])

# Fill NaN values in 'FC' columns with 0
fc_columns = [col for col in Final_pval_data.columns if 'FC' in col]
Final_pval_data[fc_columns] = Final_pval_data[fc_columns].fillna(0)

# Replace NaN values in 'ovp3' columns with 1
ovp3_columns = [col for col in Final_pval_data.columns if 'ovp3' in col]
Final_pval_data[ovp3_columns] = Final_pval_data[ovp3_columns].fillna(1)

# Replace 'ovp3' with 'pval' in column names
Final_pval_data.columns = [col.replace('ovp3', 'pval') for col in Final_pval_data.columns]

# List to store the ordered column names
ordered_columns = []
ordered_drug_names = []

# Desired order of drug categories
desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]

# Iterate through the desired order of drug categories
for category in desired_order:
    drugs_in_category = drug_category.get(category, [])
    for drug in drugs_in_category:
        # Add the FC and pval columns for each drug
        ordered_columns.extend([f"FC_{drug}", f"pval_{drug}"])
        # Add the drug to the ordered list
        ordered_drug_names.append(drug)

# Reorder the columns in the DataFrame and reset the index
Final_pval_data = Final_pval_data[ordered_columns]
Final_pval_data = Final_pval_data.reset_index()

# Final merging of both indexing dataframe and data
Final_pval_df = pd.merge(indexing_df, Final_pval_data, on='index')
Final_pval_df.set_index('index', inplace=True)
Final_pval_df.index.name = None

# Saving file 
Final_pval_df_path = os.path.join(database_files_stats, f"{directory}_Final_meanFC_pval.xlsx")
Final_pval_df.to_excel(Final_pval_df_path, index=False)
InĀ [108]:
Resistance_Genes = ['TCF21', 'MXD3', 'BHLHA9', 'JUNB', 'ESX1', 'LHX5', 'HMX2', 'MKRN1', 'SIRT1', 'RPF1', 'BCL2L2', 'BCL2', 'CASP4', 'MTCH1', 'TYMS', 'TYMP', 'UCK2']
Sensitizing_Genes = ['MYC', 'YAF2', 'E2F1', 'TFDP1', 'TEF', 'RNF7', 'SIX3', 'CDK2', 'CDK4', 'CDK6', 'FNTA', 'FGF20', 'FGF5', 'EREG', 'PDXK', 'DEDD2', 'EDAR', 'BCL2L15', 'BAD', 'UBTD1', 'SLC28A1', 'SLC28A2', 'SLC29A3', 'SLC29A4', 'TK1', 'YRDC']
InĀ [110]:
from adjustText import adjust_text

# Calculate the number of rows and columns for subplots
num_drugs = len(ordered_drug_names)
num_cols = 4  # Number of columns for subplots
num_rows = (num_drugs + num_cols - 1) // num_cols  # Calculate the number of rows

# Create subplots
plt.style.use('default')
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8*num_rows, 6*num_rows), facecolor='white')

# Plot the log2 fold change values against the rank index for each drug
for i, drug in enumerate(ordered_drug_names):
    # Calculate the subplot index
    row_idx = i // num_cols
    col_idx = i % num_cols

    # Sort the DataFrame based on the fold change (FC) for the current drug
    sorted_df = Final_pval_df.sort_values(by='FC_' + drug, ascending=False)
    
    # Assign ranks starting from 0 to the highest fold change
    sorted_df['Rank'] = range(len(sorted_df))
    
    # Plot bar plot for fold change against rank index with color from drug_color_map
    # axes[row_idx, col_idx].bar(sorted_df['Rank'], sorted_df['FC_' + drug], label=drug, color=drug_color_map.get(drug, 'skyblue'), edgecolor='none')
    axes[row_idx, col_idx].plot(sorted_df['Rank'], sorted_df['FC_' + drug], label=drug, color=drug_color_map.get(drug, 'skyblue'))
    axes[row_idx, col_idx].set_title(drug)
    axes[row_idx, col_idx].set_xlabel('Rank Index')
    axes[row_idx, col_idx].set_ylabel('Fold Change')
    axes[row_idx, col_idx].legend()
    axes[row_idx, col_idx].grid(False)
    axes[row_idx, col_idx].set_xlim(right=15000)

    highlighted_genes = []  # To store genes that are highlighted
    for gene in sorted_df['Gene_Symbol']:
        if gene in Resistance_Genes:
            resistance_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
            if not resistance_gene.empty:
                x = resistance_gene['Rank'].iloc[0]  
                y = resistance_gene['FC_' + drug].iloc[0]
                highlighted_genes.append((x, y, gene, 'red'))  # Add highlighted gene info
        elif gene in Sensitizing_Genes:
            sensitizing_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
            if not sensitizing_gene.empty:
                x = sensitizing_gene['Rank'].iloc[0]  
                y = sensitizing_gene['FC_' + drug].iloc[0]
                highlighted_genes.append((x, y, gene, 'blue'))  # Add highlighted gene info

    # Add annotations with adjust_text to avoid overlap
    texts = [axes[row_idx, col_idx].text(x, y, gene, fontsize=15, color=color) for x, y, gene, color in highlighted_genes]
    adjust_text(texts, ax=axes[row_idx, col_idx], arrowprops=dict(arrowstyle='-', color='grey', lw=1.0))

    # Highlight corresponding dots in red or blue with size 8
    for x, y, _, color in highlighted_genes:
        axes[row_idx, col_idx].scatter(x, y, color=color, s=10)

# Adjust layout
# plt.style.use('default')
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [111]:
# Calculate the number of rows and columns for subplots
num_drugs = len(ordered_drug_names)
num_cols = 4  # Number of columns for subplots
num_rows = (num_drugs + num_cols - 1) // num_cols  # Calculate the number of rows

# Create subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8*num_rows, 6*num_rows))

# Plot the volcano plots for each drug
for i, drug in enumerate(ordered_drug_names):
    # Calculate the subplot index
    row_idx = i // num_cols
    col_idx = i % num_cols

    # Sort the DataFrame based on the fold change (FC) for the current drug
    sorted_df = Final_pval_df.sort_values(by='FC_' + drug, ascending=False)
    
    # Calculate -log10(p-value)
    sorted_df['-log10(p-value)'] = -np.log10(sorted_df['pval_' + drug])
    
    # Plot volcano plot
    axes[row_idx, col_idx].scatter(sorted_df['FC_' + drug], sorted_df['-log10(p-value)'], color='lightgrey', alpha=0.5, s=5)
    axes[row_idx, col_idx].set_title(drug)
    axes[row_idx, col_idx].set_xlabel('Log2 Fold Change')
    axes[row_idx, col_idx].set_ylabel('-log10(p-value)')
    axes[row_idx, col_idx].grid(False)

    # Highlight genes with arrows based on Log2FC and p-value criteria
    highlighted_genes = []  # To store genes that are highlighted
    for gene in sorted_df['Gene_Symbol']:
        if gene in Resistance_Genes:
            resistance_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
            if not resistance_gene.empty:
                x = resistance_gene['FC_' + drug].iloc[0]
                y = resistance_gene['-log10(p-value)'].iloc[0]
                highlighted_genes.append((x, y, gene, 'red'))  # Add highlighted gene info
        elif gene in Sensitizing_Genes:
            sensitizing_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
            if not sensitizing_gene.empty:
                x = sensitizing_gene['FC_' + drug].iloc[0]
                y = sensitizing_gene['-log10(p-value)'].iloc[0]
                highlighted_genes.append((x, y, gene, 'blue'))  # Add highlighted gene info

    # Add annotations with adjust_text to avoid overlap
    texts = [axes[row_idx, col_idx].text(x, y, gene, fontsize=15, color=color) for x, y, gene, color in highlighted_genes]
    adjust_text(texts, ax=axes[row_idx, col_idx], arrowprops=dict(arrowstyle='-', color='grey', lw=0.5))
    
    # Highlight corresponding dots in red or blue with size 8
    for x, y, _, color in highlighted_genes:
        axes[row_idx, col_idx].scatter(x, y, color=color, s=10)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image

T Score calculation for High Saturaion Re-Testing¶

InĀ [112]:
# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Function to assign bin numbers based on T-score ranges
def assign_t_bin(t_score):
    
    t_score_ranges = {
        (4.0, float('inf')): 1,
        (3.3, 4.0): 2,
        (2.5, 3.3): 3,
        (2.0, 2.5): 4,
        (-2.5, 2.0): 5,
        (-3.3, -2.5): 6,
        (-4.0, -3.3): 7,
        (-float('inf'), -4.0): 8
    }

    for score_range, bin_number in t_score_ranges.items():
        if score_range[0] < t_score <= score_range[1]:
            return bin_number
    return 0

# Create a new DataFrame to store T scores
t_score_df = pd.DataFrame()
t_score_df['Verified'] = Final_pval_df['Verified']

# Initialize an empty list to store T value DataFrames for each drug
t_value_dfs = []

control_samples, experimental_samples = separate_batches(updated_divided_triplets)
# Print the results
for i, (control_sample_name, control_sample_data), (experimental_sample_name, experimental_sample_data) in zip(
        range(1, len(control_samples) + 1), control_samples, experimental_samples):
    for exp_sample in experimental_sample_data:
        drug_name = extract_name(exp_sample)

        # Access the DataFrame using the individual column names
        tested_columns = np.array(batch_compile_stat_df[control_sample_data + exp_sample])
        tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
                                dtype=float)
        [row_num, col_num] = tested_columns.shape
        treat_num = len(exp_sample)
        control_num = len(control_sample_data)

        t = np.zeros([row_num, Iter_num])
        w = np.zeros([row_num, Iter_num])
        mdiff = np.zeros([row_num, Iter_num])
        wz = np.zeros([row_num, Iter_num])

        x = tested_columns[:, control_num:]  # treatment group
        y = tested_columns[:, :control_num]  # control group

        [t, w, mdiff, wz] = stat_uneq(x, y)

        # Create a DataFrame for the T values for this drug
        t_value_df = pd.DataFrame({f'T_value_{drug_name}': t}, index=batch_compile_stat_df.index)

        # Apply the assign_t_bin function to create the 'Bin' column for this drug
        t_value_df[f'Bin_{drug_name}'] = t_value_df[f'T_value_{drug_name}'].apply(assign_t_bin)

        # Check if 'Verified' is equal to 0
        t_value_df[f'Bin_{drug_name}'] = np.where(t_score_df['Verified'] == 0, 0, t_value_df[f'Bin_{drug_name}'])

        # Append the DataFrame to the list of T value DataFrames
        t_value_dfs.append(t_value_df)

# Concatenate the list of T value DataFrames into a single DataFrame
t_score_df = pd.concat([t_score_df] + t_value_dfs, axis=1)
t_score_df = t_score_df.drop(['Verified'], axis =1)

# Initialize the "Other" column as an empty string
t_score_df['Other'] = ''

# Loop through each row to calculate the category for each gene
for index, row in t_score_df.iterrows():
    all_below_threshold = True

    # Loop through the drug names and check their T_value
    for drug_name in drug_name_list:
        t_value_column = f'T_value_{drug_name}'
        t_value = row[t_value_column]

        if not pd.isna(t_value) and abs(t_value) >= 2:
            all_below_threshold = False
            break  # Exit the loop if any T_value is above or equal to 2

    if all_below_threshold:
        t_score_df.at[index, 'Other'] = 'Control'
    else:
        counts = {
            'Peripheral': 0,
            'Core': 0,
        }

        # Loop through the drug names and check their T_value
        for drug_name in drug_name_list:
            t_value_column = f'T_value_{drug_name}'
            t_value = row[t_value_column]

            if not pd.isna(t_value):
                if abs(t_value) > 4:
                    counts['Peripheral'] += 1
                if abs(t_value) > 2:
                    counts['Core'] += 1

        if counts['Core'] >= 8:
            t_score_df.at[index, 'Other'] = 'Core'
        elif counts['Peripheral'] >= 1:
            t_score_df.at[index, 'Other'] = 'Peripheral'

# After the code, you can reset the warnings to their default behavior
Final_pval_Index = Final_pval_df.iloc[:, :7]
Final_pval_Index = Final_pval_Index.reset_index()
t_score_df = t_score_df.reset_index()

t_score_compiled_df = pd.merge(Final_pval_Index, t_score_df, on = 'index')
t_score_compiled_df.set_index('index', inplace =True)
t_score_compiled_df.index.name = None

t_score_compiled_path = os.path.join(database_files_stats, f"{directory}_T_score.xlsx")
t_score_compiled_df.to_excel(t_score_compiled_path, index=False)

warnings.resetwarnings()
InĀ [113]:
t_testing_path = os.path.join(database_files_stats, f"{directory}_T_score_testing.xlsx")
t_testing_df = pd.read_excel(t_testing_path) 
InĀ [114]:
# Create a new column to store the match result
t_testing_df['Match'] = False

# Iterate through each row in the DataFrame
for index, row in t_testing_df.iterrows():
    previous_drug = row['Previous Drug']
    previous_number = row['Previous number']
    other = row['Other']
    
    for column_name in t_testing_df.columns:
        if column_name.startswith('Bin_'):
            drug_name = column_name.split('_')[1]
            current_number = row[column_name]
            
            # Check if the "Previous number" and "Bin number" are both within the specified ranges
            if previous_drug == drug_name and (1 <= previous_number <= 4) and (1 <= current_number <= 4):
                # Both in the positive range, set 'Match' to True
                t_testing_df.at[index, 'Match'] = True
            elif previous_drug == drug_name and (5 <= previous_number <= 8) and (5 <= current_number <= 8):
                # Both in the negative range, set 'Match' to True
                t_testing_df.at[index, 'Match'] = True
            if previous_drug == other:
                t_testing_df.at[index, 'Match'] = True

t_testing_compiled_path = os.path.join(database_files_stats, f"{directory}_T_score_testing_match2.xlsx")
t_testing_df.to_excel(t_testing_compiled_path, index=False)

Z-Score evaluation¶

InĀ [115]:
# Prepare datafarme with only baseline and DMSO
finding_zscore = pd.read_excel(os.path.join(database_files_stats, f"{directory}_all_meanFC_allpval.xlsx"))
zscore_df = finding_zscore[baseline_dmso_columns]

# Prepare 4 columns division for the 
zscore_divide = [baseline_dmso_columns[i:i + 4] for i in range(0, len(baseline_dmso_columns), 4)]

# Initialize a dictionary to store the differences for each batch
zscore_data_dict = {}

# Calculate the differences between Baseline and DMSO for each set
for i, set_columns in enumerate(zscore_divide):
    baseline_column = set_columns[0]
    dmso_columns = set_columns[1:]
    
    # Calculate the mean for each DMSO group
    mean_dmso = zscore_df[dmso_columns].mean(axis=1)
    
    # Subtract the mean of DMSO from Baseline for each set
    differences = mean_dmso - zscore_df[baseline_column]
    
    # Store the differences for this batch
    batch_name = f'Batch {i + 1}'
    zscore_data_dict[batch_name] = differences

# Create a DataFrame from the differences dictionary
zscore_data_df = pd.DataFrame(zscore_data_dict)
# Convert log2FC to Z score by columns 
zscore_data_df = Zscore(zscore_data_df)

# Calculate variance of Z scores for each batch
variances = zscore_data_df.var()
# Calculate weights as the inverse of the variances
weights = 1 / variances
# Normalize the weights to sum up to 1
weights_normalized = weights / sum(weights)
# Converting normalized weight to array 
weights_normalized_array = weights_normalized.values

# Set up the Combining Z score methods 
Chosen_combining_methods = "Z_score_transformation"  # Change this to the desired method
# Assign following: "Z_score", "Meta-Analysis", "Stouffer", 
# "Fisher", "Z_score_Average", "Z_score_transformation", "Inverse_variance"

if Chosen_combining_methods == "Z_score":
    ## Z score at each row - Null hypothesis testing between each gene's Z score of batches
    # Calculate sum, mean, standard deviation for each genes (row)
    mean_per_row = zscore_data_df.mean(axis=1)
    std_per_row = zscore_data_df.std(axis=1)
    sum_per_row = np.sum(zscore_data_df, axis=1)

    # Combined Z score transformation for row
    combined_Z_scores = (sum_per_row - mean_per_row) / std_per_row

elif Chosen_combining_methods == "Meta-Analysis":
    ## Meta-Analysis with Random Effects Model
    # Change datafarme to arrays 
    z_score_val = zscore_data_df.values

    # Calculate weighted Z-scores and total weight
    weighted_z_scores = z_score_val * np.sqrt(1 / weights_normalized_array)
    total_weight = np.sum(1 / weights_normalized_array)

    combined_Z_scores = np.sum(weighted_z_scores, axis=1) / total_weight

elif Chosen_combining_methods == "Stouffer":
    ## Stouffer's Z-score method
    # Initialize an array to store the combined Z scores for each sample
    combined_Z_scores = np.zeros(zscore_data_df.shape[0])

    # Iterate over each sample (row) and calculate the combined Z score
    for i in range(zscore_data_df.shape[0]):
        numerator = np.sum(zscore_data_df.iloc[i] * weights_normalized)
        denominator = np.sqrt(np.sum(weights_normalized))
        combined_Z_scores[i] = numerator / denominator

elif Chosen_combining_methods == "Z_score_Average":
    ## Combining Z Scores by Weighted Average
    # Calculate the numerator: weighted sum of Z scores for each row
    numerator = (zscore_data_df.values * weights_normalized_array).sum(axis=1)
    # Calculate the denominator: square root of the sum of weights
    denominator = np.sqrt(sum(weights))
    # Calculate the combined Z score for each row using Stouffer's Z-score method
    combined_Z_scores = numerator / denominator

elif Chosen_combining_methods == "Fisher":
    # Fisher's method
    from scipy.stats import norm, chi2

    # Calculate the chi-squared statistic using Fisher's method
    # Fisher's method: X^2 = -2 * sum(log(p_values))
    # Since Z scores are used, we'll square them to get the p-values (two-tailed test)
    # p_values = 2 * norm.cdf(-abs(Z_scores))
    pscore_data_df = zscore_data_df.copy()

    p_values = 2 * norm.cdf(-np.abs(pscore_data_df))

    # Calculate the chi-squared statistic using Fisher's method
    combined_chi_squared = -2 * np.sum(np.log(p_values), axis=1)

    # Degrees of freedom = 2 * number of batches
    df = 2 * pscore_data_df.shape[1]

    # Calculate the p-value using the chi-squared distribution
    combined_p_value = 1 - chi2.cdf(combined_chi_squared, df)

elif Chosen_combining_methods == "Z_score_transformation":
    ## Z Score Transformation followed by Averaging
    # Calculate mean and standard deviation for each row
    mean_per_row = zscore_data_df.mean(axis=1)
    std_per_row = zscore_data_df.std(axis=1)

    # Inverse Z Score Transformation
    # Calculate the inverse Z score transformation for each column
    inverse_transformed_values = zscore_data_df.apply(lambda col: col * std_per_row + mean_per_row , axis=0)

    # Calculate the combined Z score for each row by averaging the inverse transformed values
    combined_Z_scores = inverse_transformed_values.mean(axis=1)

elif Chosen_combining_methods == "Inverse_variance":
    ## Inverse Variance Weighting
    # Assume standard error (SE) for Z scores is 1
    SE = np.ones_like(zscore_data_df.values)

    # Calculate the inverse variance weights
    inverse_variance_weights = 1 / (SE ** 2)

    # Calculate the numerator and denominator for the combined Z score
    numerator = np.sum(zscore_data_df / SE, axis=1)
    denominator = np.sum(inverse_variance_weights, axis=1)

    # Calculate the combined Z score for each row
    combined_Z_scores_inverse_variance = numerator / denominator
    combined_Z_scores = combined_Z_scores_inverse_variance
else:
    print("The combined Z method is not set")

# Align Calculated Combined Z score into dataframe
zscore_data_df['Combined_Z_Score'] = combined_Z_scores


# Assign list for headers
all_header = zscore_data_df.columns.to_list()
# # If any of batches have NaN value, change combined Z score to NaN
# zscore_data_df.loc[zscore_data_df[all_header].isnull().any(axis=1), 'Combined_Z_Score'] = np.nan

# Saving initial Z score dataframe
zscore_data_path = os.path.join(database_files_stats, f"{directory}_zscore.xlsx")
finding_zscore_index = finding_zscore.iloc[:, :4]
zscore_data_compile_df = pd.concat([finding_zscore_index, zscore_data_df], axis=1)
zscore_data_compile_df.to_excel(zscore_data_path, index=False)
# zscore_data_compile_df_sorted = zscore_data_compile_df.sort_values(
#     by='Combined_Z_Score',
#     ascending=False)
# zscore_data_compile_df = save_dataframe(fasta_data, zscore_data_df, zscore_data_path)

# Identify batch columns
batch_cols = [col for col in zscore_data_compile_df.columns if col.startswith('Batch')]

# Fill any remaining NaNs in Combined_Z_Score with the first available non-NaN from batch_cols
zscore_data_compile_df['Combined_Z_Score'] = zscore_data_compile_df.apply(
    lambda row: next((row[col] for col in batch_cols if not pd.isna(row[col])), np.nan)
    if pd.isna(row['Combined_Z_Score']) else row['Combined_Z_Score'],
    axis=1
)

# Count non-NaN batch values per row
zscore_data_compile_df['NonNaN_Batch_Count'] = zscore_data_compile_df[batch_cols].notna().sum(axis=1)

# Sort by number of valid batches descending, then Combined_Z_Score descending
zscore_data_compile_df_sorted = zscore_data_compile_df.sort_values(
    by=['NonNaN_Batch_Count', 'Combined_Z_Score'],
    ascending=[False, False]
)

zscore_data_compile_df_sorted_negative = zscore_data_compile_df.sort_values(
    by=['NonNaN_Batch_Count', 'Combined_Z_Score'],
    ascending=[False, True]
)

# Drop helper column
zscore_data_compile_df_sorted = zscore_data_compile_df_sorted.drop(columns=['NonNaN_Batch_Count'])
zscore_data_compile_df_sorted_negative = zscore_data_compile_df_sorted_negative.drop(columns=['NonNaN_Batch_Count'])

PR Curve¶

InĀ [116]:
# Induce dataframe from TCGA
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#unfavorable'] >= 1 else 0, axis=1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Unfavor_old.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

Old_HPA_unfavorable_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Unfavour PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_old_HPA_Unfavor_PR.svg")
plt.savefig(PR_path, format='svg', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 36.92
Corresponding Rank: 5249
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_old_HPA_Unfavor_PR.svg
No description has been provided for this image
InĀ [117]:
Old_HPA_unfavorable_df
Out[117]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score #favorable #unfavorable Hit Rank Precision Recall Max(Precision)
0 53066 7691 G04 ZNF132 -1.351889 5.791197 4.831242 1.339717 1.311387 9.314892 2.0 0.0 0 None NaN NaN NaN
1 100000022 84133 G08 ZNRF3 2.472490 -1.661915 4.844651 4.483725 1.916883 8.681986 0.0 0.0 0 None NaN NaN NaN
2 100010472 6051 G04 RNPEP 3.737812 1.740719 6.325549 -1.607528 -0.132890 8.326448 0.0 0.0 0 None NaN NaN NaN
3 5102 55214 G05 LEPREL1 -0.889157 0.244954 4.972440 4.312032 1.626191 7.261316 0.0 0.0 0 None NaN NaN NaN
4 9924 22795 G06 NID2 2.166479 -0.034532 5.174375 3.790967 -0.653718 7.253175 0.0 2.0 1 1 100.000000 0.019238 100.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14525 53867 5290 G06 PIK3CA 0.109405 NaN NaN NaN NaN 0.109405 0.0 0.0 0 14522 35.780196 99.961524 35.786575
14526 10422 7786 G02 MAP3K12 -0.360605 NaN NaN NaN NaN -0.360605 1.0 1.0 1 14523 35.784618 99.980762 35.786575
14527 71971 3747 G08 KCNC2 NaN NaN NaN NaN -1.099003 -1.099003 0.0 0.0 0 14524 35.782154 99.980762 35.786575
14528 7993 6170 G02 RPL39 NaN NaN NaN NaN -2.406590 -2.406590 0.0 1.0 1 14525 35.786575 100.000000 35.786575
14529 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 0.0 0.0 0 14526 35.784111 100.000000 35.784111

14530 rows Ɨ 17 columns

InĀ [118]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#unfavorable'] >= 1 else 0, axis=1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Unfavor_new.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

New_HPA_unfavorable_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Unfavor PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_new_HPA_Unfavor.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 42.74
Corresponding Rank: 6080
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_new_HPA_Unfavor.svg
No description has been provided for this image
InĀ [119]:
New_HPA_unfavorable_df
Out[119]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score #favorable #unfavorable Hit Rank Precision Recall Max(Precision)
0 53066 7691 G04 ZNF132 -1.351889 5.791197 4.831242 1.339717 1.311387 9.314892 1.0 0.0 0 None NaN NaN NaN
1 100000022 84133 G08 ZNRF3 2.472490 -1.661915 4.844651 4.483725 1.916883 8.681986 2.0 0.0 0 None NaN NaN NaN
2 100010472 6051 G04 RNPEP 3.737812 1.740719 6.325549 -1.607528 -0.132890 8.326448 1.0 0.0 0 None NaN NaN NaN
3 5102 55214 G05 LEPREL1 -0.889157 0.244954 4.972440 4.312032 1.626191 7.261316 0.0 1.0 1 1 100.000000 0.016472 100.000000
4 9924 22795 G06 NID2 2.166479 -0.034532 5.174375 3.790967 -0.653718 7.253175 0.0 2.0 1 2 100.000000 0.032944 100.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 53867 5290 G06 PIK3CA 0.109405 NaN NaN NaN NaN 0.109405 1.0 0.0 0 14500 41.855172 99.967056 41.860305
14503 10422 7786 G02 MAP3K12 -0.360605 NaN NaN NaN NaN -0.360605 1.0 1.0 1 14501 41.859182 99.983528 41.860305
14504 71971 3747 G08 KCNC2 NaN NaN NaN NaN -1.099003 -1.099003 0.0 0.0 0 14502 41.856296 99.983528 41.860305
14505 7993 6170 G02 RPL39 NaN NaN NaN NaN -2.406590 -2.406590 1.0 3.0 1 14503 41.860305 100.000000 41.860305
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 1.0 0.0 0 14504 41.857419 100.000000 41.857419

14507 rows Ɨ 17 columns

InĀ [120]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

# zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')
# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted_negative, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#favorable'] >= 1 else 0, axis=1)

# zscore_merge_df = zscore_merge_df.iloc[2:, :]

# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)

# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_favor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

HPA_new_favorable_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA favorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_new_HPA_favorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 46.48
Corresponding Rank: 6991
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_new_HPA_favorable_PR.svg
No description has been provided for this image
InĀ [121]:
HPA_new_favorable_df
Out[121]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score #favorable #unfavorable Hit Rank Precision Recall Max(Precision)
0 100067281 2294 G09 FOXF1 -4.020519 -6.371358 -0.996546 -3.861530 -0.269732 -10.795805 2.0 0.0 1 1 100.000000 0.014830 100.000000
1 5274 3142 G04 HLX -3.900398 -0.470447 -0.633137 -2.842160 -6.190604 -9.514313 2.0 0.0 1 2 100.000000 0.029660 100.000000
2 52970 79755 G02 ZNF750 -5.630888 -3.813437 0.297178 -2.508267 0.559808 -8.123800 1.0 1.0 1 3 100.000000 0.044491 100.000000
3 5068 5452 G05 POU2F2 -1.243005 -4.496343 -1.603274 0.259306 -5.186843 -8.102555 0.0 0.0 0 4 75.000000 0.044491 75.000000
4 100080502 342371 delta ATXN1L -5.561001 -1.584375 -2.546113 0.103948 -2.041524 -7.127541 0.0 0.0 0 5 60.000000 0.044491 60.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 100068241 57482 G01 KIAA1211 NaN 0.550111 NaN NaN NaN 0.550111 1.0 0.0 1 14503 46.473143 99.955509 46.481009
14503 100000034 23774 G08 BRD1 NaN 1.311774 NaN NaN NaN 1.311774 2.0 0.0 1 14504 46.476834 99.970340 46.481009
14504 100080188 131149 delta OTOL1 NaN 1.602334 NaN NaN NaN 1.602334 0.0 0.0 0 14505 46.473630 99.970340 46.481009
14505 1027 4753 G01 NELL2 NaN 2.654557 NaN NaN NaN 2.654557 1.0 0.0 1 14506 46.477320 99.985170 46.481009
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 1.0 0.0 1 14507 46.481009 100.000000 46.481009

14507 rows Ɨ 17 columns

InĀ [122]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['#Cervical_Favor_x'] = zscore_merge_df['#Cervical_Favor_x'].fillna(zscore_merge_df['#Cervical_Favor_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#Cervical_Unfavor_x'] = zscore_merge_df['#Cervical_Unfavor_x'].fillna(zscore_merge_df['#Cervical_Unfavor_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#Cervical_Favor_y', '#Cervical_Unfavor_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#Cervical_Favor_x': '#Cervical_Favor', '#Cervical_Unfavor_x': '#Cervical_Unfavor'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#Cervical_Unfavor'] >= 1 else 0, axis=1)

# zscore_merge_df = zscore_merge_df.iloc[2:, :]

# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)

# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Cervical_Unfavor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

HPA_Cervical_Unfavor_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Cervical Unfavorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_HPA_cervical_Unfavorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 2.61
Corresponding Rank: 396
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_HPA_cervical_Unfavorable_PR.svg
No description has been provided for this image
InĀ [123]:
HPA_Cervical_Unfavor_df 
Out[123]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score #Cervical_Favor #Cervical_Unfavor Hit Rank Precision Recall Max(Precision)
0 53066 7691 G04 ZNF132 -1.351889 5.791197 4.831242 1.339717 1.311387 9.314892 0.0 0.0 0 None NaN NaN NaN
1 100000022 84133 G08 ZNRF3 2.472490 -1.661915 4.844651 4.483725 1.916883 8.681986 0.0 0.0 0 None NaN NaN NaN
2 100010472 6051 G04 RNPEP 3.737812 1.740719 6.325549 -1.607528 -0.132890 8.326448 0.0 0.0 0 None NaN NaN NaN
3 5102 55214 G05 LEPREL1 -0.889157 0.244954 4.972440 4.312032 1.626191 7.261316 0.0 0.0 0 None NaN NaN NaN
4 9924 22795 G06 NID2 2.166479 -0.034532 5.174375 3.790967 -0.653718 7.253175 0.0 1.0 1 1 100.000000 0.326797 100.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 53867 5290 G06 PIK3CA 0.109405 NaN NaN NaN NaN 0.109405 0.0 0.0 0 14499 2.110490 100.000000 2.110490
14503 10422 7786 G02 MAP3K12 -0.360605 NaN NaN NaN NaN -0.360605 0.0 0.0 0 14500 2.110345 100.000000 2.110345
14504 71971 3747 G08 KCNC2 NaN NaN NaN NaN -1.099003 -1.099003 0.0 0.0 0 14501 2.110199 100.000000 2.110199
14505 7993 6170 G02 RPL39 NaN NaN NaN NaN -2.406590 -2.406590 1.0 0.0 0 14502 2.110054 100.000000 2.110054
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 0.0 0.0 0 14503 2.109908 100.000000 2.109908

14507 rows Ɨ 17 columns

InĀ [124]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

# zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')
# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted_negative, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['#Cervical_Favor_x'] = zscore_merge_df['#Cervical_Favor_x'].fillna(zscore_merge_df['#Cervical_Favor_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#Cervical_Unfavor_x'] = zscore_merge_df['#Cervical_Unfavor_x'].fillna(zscore_merge_df['#Cervical_Unfavor_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#Cervical_Favor_y', '#Cervical_Unfavor_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#Cervical_Favor_x': '#Cervical_Favor', '#Cervical_Unfavor_x': '#Cervical_Unfavor'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#Cervical_Favor'] >= 1 else 0, axis=1)

# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)

# zscore_merge_df = zscore_merge_df.iloc[2:, :]

# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)

# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Cervical_favor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

HPA_Cervical_favor_df = zscore_merge_df.copy() 

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Cervical favorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_HPA_Cervical_favorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 2.47
Corresponding Rank: 743
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_HPA_Cervical_favorable_PR.svg
No description has been provided for this image
InĀ [125]:
HPA_Cervical_favor_df
Out[125]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score #Cervical_Favor #Cervical_Unfavor Hit Rank Precision Recall Max(Precision)
0 100067281 2294 G09 FOXF1 -4.020519 -6.371358 -0.996546 -3.861530 -0.269732 -10.795805 0.0 0.0 0 None NaN NaN NaN
1 5274 3142 G04 HLX -3.900398 -0.470447 -0.633137 -2.842160 -6.190604 -9.514313 0.0 0.0 0 None NaN NaN NaN
2 52970 79755 G02 ZNF750 -5.630888 -3.813437 0.297178 -2.508267 0.559808 -8.123800 0.0 0.0 0 None NaN NaN NaN
3 5068 5452 G05 POU2F2 -1.243005 -4.496343 -1.603274 0.259306 -5.186843 -8.102555 0.0 0.0 0 None NaN NaN NaN
4 100080502 342371 delta ATXN1L -5.561001 -1.584375 -2.546113 0.103948 -2.041524 -7.127541 0.0 0.0 0 None NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 100068241 57482 G01 KIAA1211 NaN 0.550111 NaN NaN NaN 0.550111 0.0 0.0 0 14480 2.182320 100.0 2.182320
14503 100000034 23774 G08 BRD1 NaN 1.311774 NaN NaN NaN 1.311774 0.0 0.0 0 14481 2.182170 100.0 2.182170
14504 100080188 131149 delta OTOL1 NaN 1.602334 NaN NaN NaN 1.602334 0.0 0.0 0 14482 2.182019 100.0 2.182019
14505 1027 4753 G01 NELL2 NaN 2.654557 NaN NaN NaN 2.654557 0.0 0.0 0 14483 2.181868 100.0 2.181868
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 0.0 0.0 0 14484 2.181718 100.0 2.181718

14507 rows Ɨ 17 columns

InĀ [126]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['Cisplatin_Resistance_x'] = zscore_merge_df['Cisplatin_Resistance_x'].fillna(zscore_merge_df['Cisplatin_Resistance_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['5-FU_Resistance_x'] = zscore_merge_df['5-FU_Resistance_x'].fillna(zscore_merge_df['5-FU_Resistance_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', 'Cisplatin_Resistance_y', '5-FU_Resistance_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'Cisplatin_Resistance_x': 'Cisplatin_Resistance', '5-FU_Resistance_x': '5-FU_Resistance'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['Cisplatin_Resistance'] >= 1 else 0, axis=1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save output to Excel
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_Cisplatin_Resistance_PR.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

Cisplatin_Resistance_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('Cisplatin Resistance PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_Cisplatin_Resistance_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 10.94
Corresponding Rank: 1370
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Cisplatin_Resistance_PR.svg
No description has been provided for this image
InĀ [127]:
Cisplatin_Resistance_df
Out[127]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score Cisplatin_Resistance 5-FU_Resistance Hit Rank Precision Recall Max(Precision)
0 53066 7691 G04 ZNF132 -1.351889 5.791197 4.831242 1.339717 1.311387 9.314892 0 0 0 None NaN NaN NaN
1 100000022 84133 G08 ZNRF3 2.472490 -1.661915 4.844651 4.483725 1.916883 8.681986 1 0 1 1 100.000000 0.074516 100.000000
2 100010472 6051 G04 RNPEP 3.737812 1.740719 6.325549 -1.607528 -0.132890 8.326448 0 0 0 2 100.000000 0.074516 100.000000
3 5102 55214 G05 LEPREL1 -0.889157 0.244954 4.972440 4.312032 1.626191 7.261316 0 0 0 3 100.000000 0.074516 100.000000
4 9924 22795 G06 NID2 2.166479 -0.034532 5.174375 3.790967 -0.653718 7.253175 0 0 0 4 100.000000 0.074516 100.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 53867 5290 G06 PIK3CA 0.109405 NaN NaN NaN NaN 0.109405 0 0 0 14502 9.253896 100.000000 9.253896
14503 10422 7786 G02 MAP3K12 -0.360605 NaN NaN NaN NaN -0.360605 0 0 0 14503 9.253258 100.000000 9.253258
14504 71971 3747 G08 KCNC2 NaN NaN NaN NaN -1.099003 -1.099003 0 0 0 14504 9.252620 100.000000 9.252620
14505 7993 6170 G02 RPL39 NaN NaN NaN NaN -2.406590 -2.406590 0 0 0 14505 9.251982 100.000000 9.251982
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 0 0 0 14506 9.251344 100.000000 9.251344

14507 rows Ɨ 17 columns

InĀ [128]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)

zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance']], left_on='NCBI', right_on='Entrez', how='left')

# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)

# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()

# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')

# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)

# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['Cisplatin_Resistance_x'] = zscore_merge_df['Cisplatin_Resistance_x'].fillna(zscore_merge_df['Cisplatin_Resistance_y'])

# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['5-FU_Resistance_x'] = zscore_merge_df['5-FU_Resistance_x'].fillna(zscore_merge_df['5-FU_Resistance_y'])

# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', 'Cisplatin_Resistance_y', '5-FU_Resistance_y'], axis=1, inplace=True)

# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'Cisplatin_Resistance_x': 'Cisplatin_Resistance', '5-FU_Resistance_x': '5-FU_Resistance'}, inplace=True)

# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['5-FU_Resistance'] >= 1 else 0, axis=1)

# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()

# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
    zscore_merge_df['Precision'] = 0
    zscore_merge_df['Recall'] = 0
    zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
    zscore_merge_df['Rank'] = None
    zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)

    # Initialize variables
    precision_values = []
    recall_values = []
    precision_cumulative_hit = 0
    recall_cumulative_hit = 0
    hit_count = zscore_merge_df['Hit'].sum()

    # Compute Precision and Recall
    for idx, row in zscore_merge_df.iterrows():
        if idx < first_hit_index:
            precision_values.append(None)
            recall_values.append(None)
            continue

        precision_cumulative_hit += row['Hit']
        recall_cumulative_hit += row['Hit']

        precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
        recall = recall_cumulative_hit / hit_count * 100

        precision_values.append(precision)
        recall_values.append(recall)

    zscore_merge_df['Precision'] = precision_values
    zscore_merge_df['Recall'] = recall_values

    # Calculate cumulative max of precision
    zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]

# Save output to Excel
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_5-FU_Resistance_PR.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)

FU_Resistance_df = zscore_merge_df.copy()

# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']

# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()

# Print F1 score where precision ā‰ˆ recall
print(f"F1 score where Precision ā‰ˆ Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")

# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('5-FU Resistance PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()

# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')

# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')

# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_5-FU_Resistance_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")

plt.show()
F1 score where Precision ā‰ˆ Recall: 8.03
Corresponding Rank: 1061
RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_5-FU_Resistance_PR.svg
No description has been provided for this image
InĀ [129]:
FU_Resistance_df
Out[129]:
ORF_ID NCBI Group Gene_Symbol Batch 1 Batch 2 Batch 3 Batch 4 Batch 5 Combined_Z_Score Cisplatin_Resistance 5-FU_Resistance Hit Rank Precision Recall Max(Precision)
0 53066 7691 G04 ZNF132 -1.351889 5.791197 4.831242 1.339717 1.311387 9.314892 0 0 0 None NaN NaN NaN
1 100000022 84133 G08 ZNRF3 2.472490 -1.661915 4.844651 4.483725 1.916883 8.681986 1 0 0 None NaN NaN NaN
2 100010472 6051 G04 RNPEP 3.737812 1.740719 6.325549 -1.607528 -0.132890 8.326448 0 0 0 None NaN NaN NaN
3 5102 55214 G05 LEPREL1 -0.889157 0.244954 4.972440 4.312032 1.626191 7.261316 0 0 0 None NaN NaN NaN
4 9924 22795 G06 NID2 2.166479 -0.034532 5.174375 3.790967 -0.653718 7.253175 0 0 0 None NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14502 53867 5290 G06 PIK3CA 0.109405 NaN NaN NaN NaN 0.109405 0 0 0 14496 7.064018 100.0 7.064018
14503 10422 7786 G02 MAP3K12 -0.360605 NaN NaN NaN NaN -0.360605 0 0 0 14497 7.063530 100.0 7.063530
14504 71971 3747 G08 KCNC2 NaN NaN NaN NaN -1.099003 -1.099003 0 0 0 14498 7.063043 100.0 7.063043
14505 7993 6170 G02 RPL39 NaN NaN NaN NaN -2.406590 -2.406590 0 0 0 14499 7.062556 100.0 7.062556
14506 100015186 84435 G08 GPR123 NaN NaN NaN NaN NaN NaN 0 0 0 14500 7.062069 100.0 7.062069

14507 rows Ɨ 17 columns

InĀ [130]:
# Store your DataFrames in a dictionary
df_dict = {
    # "Old HPA Unfavorable": Old_HPA_unfavorable_df.copy(),
    "New HPA Unfavorable": New_HPA_unfavorable_df.copy(),
    "New HPA Favorable": HPA_new_favorable_df.copy(),
    "Cervical Unfavorable": HPA_Cervical_Unfavor_df.copy(),
    "Cervical Favorable": HPA_Cervical_favor_df.copy(),
    "Cisplatin Resistance": Cisplatin_Resistance_df.copy(),
    "5-FU Resistance": FU_Resistance_df.copy(),# or Cisplatin_Resistance_df if you prefer
}

# Helper function to compute precision, recall, and max precision
def compute_pr_metrics(df):
    df = df.copy()
    first_hit_index = df[df['Hit'] == 1].index.min()
    
    if pd.isna(first_hit_index):
        df['Precision'] = 0
        df['Recall'] = 0
        df['Rank'] = range(1, len(df) + 1)
    else:
        df['Rank'] = None
        df.loc[first_hit_index:, 'Rank'] = range(1, len(df) - first_hit_index + 1)

        precision_values = []
        recall_values = []
        precision_cumulative_hit = 0
        recall_cumulative_hit = 0
        hit_count = df['Hit'].sum()

        for idx, row in df.iterrows():
            if idx < first_hit_index:
                precision_values.append(None)
                recall_values.append(None)
                continue

            precision_cumulative_hit += row['Hit']
            recall_cumulative_hit += row['Hit']
            precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
            recall = recall_cumulative_hit / hit_count * 100

            precision_values.append(precision)
            recall_values.append(recall)

        df['Precision'] = precision_values
        df['Recall'] = recall_values
        df['Max(Precision)'] = df['Precision'].iloc[::-1].cummax()[::-1]

    return df

# Plot setup
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 14))
axes = axes.flatten()

color_prec = '#dc143cff'
color_recall = '#003e98ff'

# Loop through dataframes and axes
for ax, (title, df) in zip(axes, df_dict.items()):
    df = compute_pr_metrics(df)

    rank = df['Rank']
    precision = df['Max(Precision)']
    recall = df['Recall'].apply(lambda x: float(f"{x:.2g}"))  # 2 significant figures

    # Compute F1 score and location
    f1_scores = 2 * (precision * recall) / (precision + recall)
    diff = np.abs(precision - recall)
    min_diff_idx = diff.idxmin()
    f1_val = f1_scores[min_diff_idx]
    rank_at_f1 = rank[min_diff_idx]
    min_precision = precision.min()

    # Calculate hit percentage in top 10%
    top_10_cutoff = int(len(df) * 0.1)
    top_10_hits = df.iloc[:top_10_cutoff]['Hit'].sum()
    total_hits = df['Hit'].sum()
    hit_pct_top10 = (top_10_hits / top_10_cutoff  * 100) if total_hits else 0

    # Plot precision on primary y-axis
    ax.plot(rank, precision, label='Precision', linewidth=4.0, color=color_prec)
    ax.set_xlabel('Rank (log scale)')
    ax.set_ylabel('Precision (%)', color=color_prec)
    ax.set_xscale('log')
    ax.set_ylim(0, 105)
    ax.set_xlim(rank.min(), rank.max())
    ax.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
    ax.tick_params(axis='x', which='both', bottom=True, top=False)
    ax.tick_params(axis='y', labelcolor=color_prec)
    ax.axhline(y=min_precision, color='black', linestyle='--', linewidth=1.5)

    # Remove minor ticks manually (important fix)
    ax.xaxis.set_minor_locator(plt.NullLocator())

    # Plot recall on secondary y-axis
    ax2 = ax.twinx()
    ax2.plot(rank, recall, label='Recall', linewidth=4.0, color=color_recall)
    ax2.set_ylabel('Recall (%)', color=color_recall)
    ax2.set_ylim(0, 105)
    ax2.tick_params(axis='y', labelcolor=color_recall)

    # Title with F1 and Top 10% hit info
    ax.set_title(f'{title} PR Curve\nF1 ā‰ˆ {f1_val:.2f} @ Rank {int(rank_at_f1)}\nTop 10% Hit Rate: {hit_pct_top10:.1f}%')

# Adjust layout
plt.tight_layout()
plt.suptitle(f'Precision-Recall Curves {Chosen_combining_methods}', fontsize=40, y=1.1)
plt.subplots_adjust(top=0.9)
plt.show()
No description has been provided for this image
InĀ [131]:
top_10_n = int(len(HPA_Cervical_Unfavor_df ['Rank']) * 0.10)
top10_df = HPA_Cervical_Unfavor_df .iloc[:top_10_n]
observed_hits = top10_df['Hit'].sum()
rest_df = HPA_Cervical_Unfavor_df .iloc[top_10_n:]
rest_n = int(len(HPA_Cervical_Unfavor_df['Rank']) * 0.90)
InĀ [132]:
observed_hits 
Out[132]:
np.int64(29)
InĀ [133]:
rest_hits = np.array(rest_df['Hit']) 

random_indices = np.random.choice(len(rest_hits), size=top_10_n, replace=True)
# Sum of hits for the randomly selected subset
random_hits_sum = rest_hits[random_indices].sum()

print(random_hits_sum)
33

Kaplan-Meier Survival¶

InĀ [134]:
from lifelines import KaplanMeierFitter, statistics
InĀ [135]:
# def find_best_cutoff(df, fpkm_col='pTPM', time_col='Days', status_col='Status'):
#     df = df[[fpkm_col, time_col, status_col]].dropna()
#     cutoffs = np.percentile(df[fpkm_col], np.arange(20, 81))  # avoid extremes
#     best_p = 1.0
#     best_cut = None

#     for cut in cutoffs:
#         df['Group'] = df[fpkm_col] > cut
#         try:
#             groups = df.groupby('Group')
#             T1, E1 = groups.get_group(True)[time_col], groups.get_group(True)[status_col]
#             T2, E2 = groups.get_group(False)[time_col], groups.get_group(False)[status_col]
#             result = statistics.logrank_test(T1, T2, E1, E2)
#             if result.p_value < best_p:
#                 best_p = result.p_value
#                 best_cut = cut
#         except KeyError:
#             continue  # if one group has no samples
#     return best_cut, best_p
InĀ [136]:
from matplotlib.lines import Line2D

surival_df = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="CCDC47 - 5FU Liver")

# Clean and preprocess data
surival_df['Status'] = surival_df['Status'].astype(str).str.strip().str.lower().map({'dead': 1, 'alive': 0})
surival_df['Days'] = surival_df['Days'].str.extract(r'(\d+)').astype(float)
surival_df['Years'] = surival_df['Days'] / 365.25  # Convert to years
surival_df['pTPM'] = surival_df['pTPM'].astype(float)

# Divide into High and Low based on cutoff from HPA
Cutoff = 84.06  # From HPA webpage 20250731
logrankP = 0.00078 # From HPA webpage 20250731
surival_df['pTPM_group'] = surival_df['pTPM'].apply(lambda x: 'High' if x > Cutoff else 'Low')

# Define colors and line styles for groups
colors = {'High': '#007A03', 'Low': '#FFA90E'}

# Plot KM curves
kmf = KaplanMeierFitter()
plt.figure(figsize=(8, 8))

# Plot and collect legend handles
legend_handles = []

for group in ['High', 'Low']:
    group_data = surival_df[surival_df['pTPM_group'] == group]
    label = f"{group} (n={len(group_data)})"
    kmf.fit(durations=group_data['Years'], event_observed=group_data['Status'], label=label)
    ax = kmf.plot_survival_function(ci_show=False, color=colors[group], linewidth=7)
    # Create custom legend handle
    handle = Line2D([0], [0], color=colors[group], lw=7, label=label)
    legend_handles.append(handle)

# Add dummy lines for cutoff and p-value (invisible but shows in legend)
legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=f"P = {logrankP:.5f}"))

# Final plot adjustments
plt.title("CESC Survival by PGM1 expression", fontsize= 25)
plt.grid(False)
plt.xlabel("Time (years)", fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.ylabel("Survival Probability", fontsize=25)
plt.ylim(0, 1.05)
plt.xlim(0, 18)
plt.tight_layout()

# Add custom legend with all handles
plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize = 20)

plt.show()
No description has been provided for this image
InĀ [137]:
def plot_kaplan_meier(df, gene_name, cutoff, logrank_p, title,
                      colors=None, figsize=(8, 8)):
    """
    Plots a Kaplan–Meier curve for High/Low expression groups based on pTPM cutoff.
    
    Parameters:
        df (DataFrame): Must contain 'Status', 'Days', and 'pTPM' columns.
        gene_name (str): Gene name (for annotation only).
        cutoff (float): Expression cutoff to define High/Low groups.
        logrank_p (float): Pre-computed log-rank p-value.
        title (str): Title of the plot.
        colors (dict): Optional dict like {'High': '#FF844D', 'Low': '#4DFFED'}.
        figsize (tuple): Size of the plot.
        max_years (int or float): X-axis upper limit in years.
    """
    if colors is None:
        colors = {'High': '#007A03', 'Low': '#FFA90E'}

    # Preprocess input data
    df = df.copy()
    df['Status'] = df['Status'].astype(str).str.strip().str.lower().map({'dead': 1, 'alive': 0})
    df['Days'] = df['Days'].astype(str).str.extract(r'(\d+)').astype(float)
    df['Years'] = df['Days'] / 365.25
    df['pTPM'] = df['pTPM'].astype(float)
    df = df.dropna(subset=['Years', 'Status', 'pTPM'])

    # Grouping based on cutoff
    df['pTPM_group'] = df['pTPM'].apply(lambda x: 'High' if x > cutoff else 'Low')
    
    max_years = df['Years'].max()

    # Plot setup
    plt.figure(figsize=figsize)
    kmf = KaplanMeierFitter()
    group_handles = {}

    for group in ['Low', 'High']:
        group_data = df[df['pTPM_group'] == group]
        label = f"{group} (n={len(group_data)})"
        kmf.fit(durations=group_data['Years'], event_observed=group_data['Status'], label=label)
        kmf.plot_survival_function(ci_show=False, color=colors[group], linewidth=7)
        handle = Line2D([0], [0], color=colors[group], lw=7, label=label)
        group_handles[group] = handle

    # Add cutoff and p-value as text-only legend entries
    legend_handles = [
        group_handles['High'],
        group_handles['Low'],
        Line2D([0], [0], color='none', label=f"p = {logrank_p:.5f}")
    ]

    fontsize = 25
    # Final plot settings
    plt.title(title, fontsize=fontsize)
    plt.xlabel("Time (years)", fontsize=fontsize)
    plt.ylabel("Survival Probability",fontsize=fontsize)
    plt.ylim(0.05, 1.05)
    plt.xlim(0, max_years*1.05)
    plt.yticks(fontsize=fontsize)
    plt.xticks(fontsize=fontsize)     
    plt.grid(False)
    plt.tight_layout()
    plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=20)
    
    os.makedirs(graphs_files_stats, exist_ok=True)
    filename = f"{directory}_{gene_name}.svg"
    path = os.path.join(graphs_files_stats, filename)
    plt.savefig(path, format='svg', dpi=1000)
    print(f"Saved: {filename} to {path}")
    
    plt.show()
InĀ [138]:
CCDC47 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="CCDC47 - 5FU Liver")
SH3BP2 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="SH3BP2 - Cis Pan")
SLIRP = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="SLIRP - Cer Fav")
PGM1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="PGM1 - Cer Unfav")
InĀ [139]:
plot_kaplan_meier(CCDC47, gene_name="CCDC47", cutoff=84.06, logrank_p=0.00078,
                  title="LIHC CCDC47 Expression")
plot_kaplan_meier(SH3BP2, gene_name="SH3BP2", cutoff=13.82, logrank_p=0.017,
                  title="PAAD SH3BP2 Expression")
plot_kaplan_meier(SLIRP, gene_name="SLIRP", cutoff=162.41, logrank_p=0.000017,
                  title="CESC SLIRP Expression")
plot_kaplan_meier(PGM1, gene_name="PGM1", cutoff=62.51, logrank_p=0.00033,
                  title="CESC PGM1 Expression")
Saved: RQ023682_CCDC47.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_CCDC47.svg
No description has been provided for this image
Saved: RQ023682_SH3BP2.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_SH3BP2.svg
No description has been provided for this image
Saved: RQ023682_SLIRP.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_SLIRP.svg
No description has been provided for this image
Saved: RQ023682_PGM1.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_PGM1.svg
No description has been provided for this image
InĀ [140]:
Zscore_values = FU_Resistance_df[['ORF_ID', 'NCBI', 'Gene_Symbol', 'Combined_Z_Score' ]].copy().sort_values(by='Combined_Z_Score', ascending=False)

# Zscore_values = FU_Resistance_df[['ORF_ID', 'NCBI', 'Gene_Symbol', 'Combined_Z_Score' ]].copy()
InĀ [141]:
Zscore_values_clean = Zscore_values.dropna(subset=['Combined_Z_Score']).copy()
Zscore_values_clean['Rank'] = np.arange(1, len(Zscore_values_clean) + 1)

plt.figure(figsize=(7, 7)) # Set the figure size for better readability

# Scatter plot: Each point represents a gene
# X-axis: Rank (derived from the sorted index)
# Y-axis: Combined_Z_Score
plt.scatter(Zscore_values_clean['Rank'], Zscore_values_clean['Combined_Z_Score'],
            s=10,  # Size of the points
            alpha=0.6, # Transparency of the points
            color='grey', # Color of the points
            label='Gene Z-Score')

# Optional: Add a line connecting the points if you want to emphasize the trend
# plt.plot(Zscore_values_clean['Rank'], Zscore_values_clean['Combined_Z_Score'],
#          color='red', linestyle='-', linewidth=0.5, alpha=0.5, label='Z-Score Trend')

genes_to_highlight = ['CCDC47', 'SH3BP2', 'PGM1', 'SLIRP']
highlight_data = Zscore_values_clean[
    Zscore_values_clean['Gene_Symbol'].isin(genes_to_highlight)
].copy()

# --- Highlighted Genes Plot ---
# Using a larger size, different color, and a border to make them stand out
plt.scatter(highlight_data['Rank'], highlight_data['Combined_Z_Score'],
            s=100, # Larger size
            alpha=1, # Fully opaque
            color='red', # Distinct color
            edgecolor='black', # Black border
            zorder=5, # Ensure these points are on top of others
            label='Highlighted Genes')

# Add annotations for highlighted genes
for i, row in highlight_data.iterrows():
    plt.annotate(
        row['Gene_Symbol'],
        (row['Rank'], row['Combined_Z_Score']),
        xytext=(5, 5), # Offset text slightly from the point
        textcoords='offset points',
        fontsize=9,
        color='darkred',
        ha='left', # Horizontal alignment
        va='bottom' # Vertical alignment
    )

# Add labels and title
plt.xlabel('Gene Rank')
plt.ylabel('Z-Score')
plt.title('Gene Rank Plot: Combined Z-Score vs. Rank')

# Add a horizontal line at Z-score = 0 for reference
plt.axhline(y=0, color='gray', linestyle='--', linewidth=1, label='Z-Score = 0')

# Customize ticks and grid for better readability
plt.xticks(np.linspace(1, len(Zscore_values_clean), 5, dtype=int), rotation=45, ha="right") # Show 10 ticks evenly spaced
plt.grid(True, linestyle='--', alpha=0.7)

# Add a legend
# plt.legend()

# Display the plot
plt.tight_layout() # Adjust layout to prevent labels from overlapping
plt.show()
No description has been provided for this image

Pearson Correlation Coefficient analysis¶

InĀ [142]:
#Importing tqdm for jupyter notebook
from tqdm.notebook import tqdm

# Check if the result files already exist
PCC_testing_path = os.path.join(database_files_stats, f"{directory}_PCC_Result_Full.csv")
PCC_filter_path = os.path.join(database_files_stats, f"{directory}_PCC_Result_filter.xlsx")

if os.path.isfile(PCC_testing_path) and os.path.isfile(PCC_filter_path):
    print("Result files already exist. Skipping PCC calculation.")
else:
    # Continue with your existing code for PCC calculation

    # Change dataframe to Fold Changes only
    FC_columns = Final_pval_df[['NCBI'] + list(Final_pval_df.columns[Final_pval_df.columns.str.startswith('FC_')])]
    FC_columns.set_index('NCBI', inplace=True)

    # Function to calculate PCC and p-value for a pair of genes
    def calculate_pcc_and_pval(gene_pair):
        gene_a, gene_b = gene_pair
        expression_a = FC_columns.loc[gene_a].values
        expression_b = FC_columns.loc[gene_b].values
        pcc, p_val = pearsonr(expression_a, expression_b)
        return gene_a, gene_b, pcc, p_val

    # Get all unique combinations of genes
    gene_combinations = list(combinations(FC_columns.index, 2))

    # Use multiprocessing for parallel computation
    num_cores = cpu_count()
    with Pool(num_cores) as pool:
        results = list(tqdm(pool.imap(calculate_pcc_and_pval, gene_combinations), total=len(gene_combinations), desc='Processing'))

    # Create a DataFrame from the results
    result_values = pd.DataFrame(results, columns=['Gene A', 'Gene B', 'PCC', 'P-value'])

    # Save a full result into CSV file 
    result_values.to_csv(PCC_testing_path, index=False)

    # Get the filtered result
    PCC_filter = result_values[(result_values['PCC'] > 0.9) | (result_values['PCC'] < -0.9) & (result_values['P-value'] < 0.05)]

    # Save a filtered result into excel file
    PCC_filter.to_excel(PCC_filter_path, index=False)
Result files already exist. Skipping PCC calculation.

Spearmen Correlation Coefficient analysis¶

InĀ [143]:
#Importing tqdm for jupyter notebook
from tqdm.notebook import tqdm

# Check if the result files already exist
SCC_testing_path = os.path.join(database_files_stats, f"{directory}_SCC_Result_Full.csv")
SCC_filter_path = os.path.join(database_files_stats, f"{directory}_SCC_Result_filter.xlsx")

if os.path.isfile(SCC_testing_path) and os.path.isfile(SCC_filter_path):
    print("Result files already exist. Skipping PCC calculation.")
else:
    # Continue with your existing code for PCC calculation

    # Change dataframe to Fold Changes only
    FC_columns = Final_pval_df[['NCBI'] + list(Final_pval_df.columns[Final_pval_df.columns.str.startswith('FC_')])]
    FC_columns.set_index('NCBI', inplace=True)

    # Function to calculate PCC and p-value for a pair of genes
    def calculate_scc_and_pval(gene_pair):
        gene_a, gene_b = gene_pair
        expression_a = FC_columns.loc[gene_a].values
        expression_b = FC_columns.loc[gene_b].values
        pcc, p_val = spearmanr(expression_a, expression_b)
        return gene_a, gene_b, pcc, p_val

    # Get all unique combinations of genes
    gene_combinations = list(combinations(FC_columns.index, 2))

    # Use multiprocessing for parallel computation
    num_cores = cpu_count()
    with Pool(num_cores) as pool:
        results = list(tqdm(pool.imap(calculate_scc_and_pval, gene_combinations), total=len(gene_combinations), desc='Processing'))

    # Create a DataFrame from the results
    result_values = pd.DataFrame(results, columns=['Gene A', 'Gene B', 'PCC', 'P-value'])

    # Save a full result into CSV file 
    result_values.to_csv(SCC_testing_path, index=False)

    # Get the filtered result
    PCC_filter = result_values[(result_values['PCC'] > 0.9) | (result_values['PCC'] < -0.9) & (result_values['P-value'] < 0.05)]

    # Save a filtered result into excel file
    PCC_filter.to_excel(SCC_filter_path, index=False)
Result files already exist. Skipping PCC calculation.

Upset Graph¶

InĀ [144]:
# mRNA_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2.xlsx")
# from Bio import Entrez
# def get_entrez_id_from_gene_name(gene_name):
#     if not gene_name:
#         return 'N/A' # Handle empty gene names

#     try:
#         handle = Entrez.esearch(db="gene", term=f"{gene_name}[Gene Name] AND human[Organism]", retmax="1")
#         record = Entrez.read(handle)
#         handle.close()

#         if record["IdList"]:
#             # Entrez IDs are returned as strings in the IdList
#             return record["IdList"][0]
#         else:
#             return "N/A" # Gene not found
#     except Exception as e:
#         print(f"Error fetching Entrez ID for {gene_name}: {e}")
#         return "Error" # Indicate an error occurred

# entrez_ids = []
# for index, row in mRNA_df.iterrows():
#     gene_name = row['gene_id']
#     entrez_id = get_entrez_id_from_gene_name(gene_name)
#     entrez_ids.append(entrez_id)
    
# mRNA_df['entrez_acc_number'] = entrez_ids
# mRNA_df.to_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2_with_entrez.xlsx", index=False)
InĀ [145]:
mRNA_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2_with_entrez.xlsx")
InĀ [146]:
Dox_main = Final_pval[["NCBI", "FC_Doxorubicin", "ovp3_Doxorubicin"]]
Dox_mRNA = mRNA_df[["Entrez_ID", "Full_log2FC", "Full_ppval", "24hr_log2FC", "24hr_ppval"]]
InĀ [147]:
Dox_main_all = Final_pval["NCBI"].dropna().astype(int).unique().tolist()
Dox_mRNA_all = mRNA_df["Entrez_ID"].dropna().astype(int).unique().tolist()
InĀ [148]:
# DEG list 1 from Final_pval based on FC_Doxorubicin and ovp3_Doxorubicin
Main_DEGs = Final_pval[
    ((Final_pval["FC_Doxorubicin"] > 0.5) | (Final_pval["FC_Doxorubicin"] < -0.5)) &
    (Final_pval["ovp3_Doxorubicin"] < 0.05)
]["NCBI"].dropna().astype(int).unique().tolist()

# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Full_DEGs = mRNA_df[
    ((mRNA_df["Full_log2FC"] > 0.5) | (mRNA_df["Full_log2FC"] < -0.5)) &
    (mRNA_df["Full_ppval"] < 0.05)
]["Entrez_ID"].dropna().astype(int).unique().tolist()

# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
short_DEGs = mRNA_df[
    ((mRNA_df["24hr_log2FC"] > 0.5) | (mRNA_df["24hr_log2FC"] < -0.5)) &
    (mRNA_df["24hr_ppval"] < 0.05)
]["Entrez_ID"].dropna().astype(int).unique().tolist()
InĀ [149]:
from upsetplot import UpSet, from_memberships
import matplotlib.pyplot as plt
InĀ [150]:
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning)

# # Optional: Set global font style
# mpl.rcParams.update({
#     'font.size': 12,         # Base font size
#     'axes.titlesize': 16,    # Title font
#     'axes.labelsize': 14,    # Axis labels
#     'xtick.labelsize': 12,   # Tick labels
#     'ytick.labelsize': 12,
#     'legend.fontsize': 12,
# })

# # Reuse your data preparation from before
# deg_sets = {
#     "ORFeome Screening": set(Dox_main_all),
#     "mRNA seq": set(Dox_mRNA_all),
#     "ORFeome DEGs": set(Main_DEGs),
#     "120 hr screening": set(Full_DEGs),
#     "24 hr screening": set(short_DEGs),
# }

# all_genes = set.union(*deg_sets.values())
# memberships = [[name for name, s in deg_sets.items() if gene in s] for gene in all_genes]
# upset_data = from_memberships(memberships)

# # Plot with adjusted layout and title
# plt.figure(figsize=(14, 8))
# up = UpSet(upset_data, subset_size='count', show_counts=True)
# up.plot()

# # Optional: Add a bigger title
# plt.suptitle("UpSet Plot of DEG List Overlaps", fontsize=18)

# # Optional: Adjust layout manually instead of tight_layout
# plt.subplots_adjust(hspace=0.4, top=0.85)
# plt.show()
InĀ [151]:
from upsetplot import UpSet, from_memberships
import matplotlib.pyplot as plt

# DEG group for UpSet plot
deg_sets = {
    "Main_DEGs": set(Main_DEGs),
    "Full_DEGs": set(Full_DEGs),
    "short_DEGs": set(short_DEGs),
}

# Build membership from union of genes
all_genes = set.union(*deg_sets.values())
memberships = [[name for name, s in deg_sets.items() if gene in s] for gene in all_genes]
upset_data = from_memberships(memberships)

# Plot UpSet
plt.figure(figsize=(10, 6))
up = UpSet(upset_data, show_counts=True, subset_size='count')
up.plot()
plt.suptitle("Overlap Among DEGs (Main, Full, 24hr)", fontsize=16)
plt.subplots_adjust(top=0.85)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:

Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



<Figure size 1000x600 with 0 Axes>
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

No description has been provided for this image
InĀ [152]:
deg_sets = {
    "ORFeomes": set(Main_DEGs),
    "120 hr": set(Full_DEGs),
    "24 hr": set(short_DEGs),
}

Dox_main_all_f = [0] * len(set(Dox_main_all))
Dox_mRNA_all_f = [99999999] * len(set(Dox_mRNA_all))

# Step 3: Combine into membership list
memberships = []

# Add fake memberships (just for bar chart)
memberships += [["Total ORFeomes"] for _ in Dox_main_all_f]
memberships += [["mRNA transcriptomics"] for _ in Dox_mRNA_all_f]

# Add real DEG set memberships
for gene in set.union(*deg_sets.values()):
    gene_membership = [name for name, geneset in deg_sets.items() if gene in geneset]
    memberships.append(gene_membership)

# Step 4: Build UpSet data and plot
from upsetplot import from_memberships

data = from_memberships(memberships)

upset = UpSet(data, show_counts=True, subset_size='count', sort_by='degree', orientation="vertical", min_degree=2)

plt.rcParams["font.size"] = 6
upset.plot()

# Grab the current figure and axes
fig = plt.gcf()
axes = fig.get_axes()

# Usually, the set labels are on the first or second axis — let's check and rotate all x-tick labels
for ax in axes:
    # Rotate x tick labels if they exist
    if ax.get_xticklabels():
        for label in ax.get_xticklabels():
            label.set_rotation(270)
            # label.set_ha('right')

# plt.tight_layout()

# plt.suptitle("Doxorubicin Screening", fontsize=16)
# Define the file path for storing graph
Upset_path = os.path.join(graphs_files_stats, f"{directory}_upset.svg")
plt.savefig(Upset_path , format='svg', bbox_inches='tight', dpi=1000)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:

Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

No description has been provided for this image
InĀ [153]:
# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [str(column) for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
excludes = ("63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree")
columns_to_include = [col for col in columns_to_include if not any(exclude in col for exclude in excludes)]
InĀ [154]:
import re
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")

# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [str(column) for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
excludes = ("63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree")
columns_to_include = [col for col in columns_to_include if not any(exclude in col for exclude in excludes)]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)

# Exclude some columns
excludes = {"63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree"}

# Add columns containing specific patterns like 'DMSO' or 'baseline'
pattern_excludes = {'DMSO', 'baseline'}
for col in batch_compile_df_corr.columns:
    if any(pattern.lower() in col.lower() for pattern in pattern_excludes):
        excludes.add(col)

# Drop all the identified columns
batch_compile_df_corr_f = batch_compile_df_corr.drop(columns=list(excludes), errors='ignore')

# Use numeric columns from batch_compile_df_corr_f
numeric_df = batch_compile_df_corr_f.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Extract drug names from column names
def extract_drug_name(col):
    match = re.match(r'\d+-(.*)-[A-Z]$', col)
    return match.group(1) if match else col

drug_names = [extract_drug_name(col) for col in corr_matrix.columns]

# Compute positions for one label per drug
tick_positions = []
tick_labels = []
seen = set()
for i, name in enumerate(drug_names):
    if name not in seen:
        tick_positions.append(i)
        tick_labels.append(name)
        seen.add(name)

# Plot the heatmap
fig, ax = plt.subplots(figsize=(16, 16))
heatmap = sns.heatmap(corr_matrix, 
                      cmap='Purples', 
                      annot=False, 
                      linewidths=0,
                      cbar=False,
                      ax=ax)

colorbar = fig.colorbar(heatmap.get_children()[0],
                        ax=ax,
                        orientation='horizontal',
                        shrink=0.5,
                        pad=0.1,
                        location ='top')  # pad controls spacing from the plot

# Add colorbar label
colorbar.set_label("Pearson Correlation Coefficient")


# Set simplified ticks
plt.xticks(tick_positions, tick_labels, rotation=45, ha='center')
plt.yticks(tick_positions, tick_labels, rotation=0)

Figure_PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_Figure_Heatmap.svg")
plt.savefig(Figure_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=1000)
print(f"{directory}_Post_PCC_Heatmap.svg saved to {Figure_PCC_Heatmap_path }")

plt.tight_layout()
plt.show()
RQ023682_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Figure_Heatmap.svg
No description has been provided for this image
InĀ [155]:
plt.rcParams['axes.labelsize'] = 30  # For X and Y labels
plt.rcParams['xtick.labelsize'] = 30 # For X-axis tick labels
plt.rcParams['ytick.labelsize'] = 30 # For Y-axis tick labels
plt.rcParams['legend.fontsize'] = 30 # For the legend
plt.rcParams['axes.titlesize'] = 30 # For the plot title
plt.rcParams['axes.linewidth'] = 1.5 # For the axes lines
plt.rcParams['font.size'] = 30 # For all text in the plot
InĀ [156]:
Autophagy_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/RQ025080_Final_meanFC_pval.xlsx", sheet_name='Sheet1')
Autophagy_df_all = Autophagy_df["NCBI"].dropna().astype(int).unique().tolist()

Final_all = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_Screening.xlsx", sheet_name='A')
InĀ [157]:
TAS_main = Final_all[['NCBI', 'Verified', 'Silencing', "FC_TAS102", "pval_TAS102"]]
InĀ [158]:
TAS_DEGs = TAS_main[
    ((TAS_main["FC_TAS102"] >= 0.5) | (TAS_main["FC_TAS102"] <= -0.5)) &
    (TAS_main["pval_TAS102"] < 0.05) 
    & (TAS_main["Silencing"] == 'No Silence')
    & (TAS_main["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [159]:
# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Upper_DEG = Autophagy_df[
    ((Autophagy_df["FC_Upper"] >=0.5) | (Autophagy_df["FC_Upper"] <= -0.5)) &
    (Autophagy_df["pval_Upper"] < 0.05) &
    (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()

# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
Lower_DEG = Autophagy_df[
    ((Autophagy_df["FC_Lower"] >= 0.5) | (Autophagy_df["FC_Lower"] <= -0.5)) &
    (Autophagy_df["pval_Lower"] < 0.05) &
    (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [160]:
# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Upper_DEG_2 = Autophagy_df[
    ((Autophagy_df["FC_Com_Exp_Up"] >= 0.5) | (Autophagy_df["FC_Com_Exp_Up"] <= -0.5)) &
    (Autophagy_df["pval_Com_Exp_Up"] < 0.05) 
    & (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()

# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
Lower_DEG_2 = Autophagy_df[
    ((Autophagy_df["FC_Com_Exp_Lo"] >= 0.5) | (Autophagy_df["FC_Com_Exp_Lo"] <= -0.5)) &
    (Autophagy_df["pval_Com_Exp_Lo"] < 0.05) 
    & (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [161]:
# All

Upper_DEG_3 = Upper_DEG + Upper_DEG_2
Lower_DEG_3 = Lower_DEG + Lower_DEG_2

auto_degs_3 = {
    "Main DEGs": set(TAS_DEGs),
    "Upper DEGs": set(Upper_DEG_3),
    "Lower DEGs": set(Lower_DEG_3)
}

auto_all_3 = set.union(*auto_degs_3.values())
auto_memberships_3 = [[name for name, s in auto_degs_3.items() if gene in s] for gene in auto_all_3]
auto_upset_3 = from_memberships(auto_memberships_3)

# Plot UpSet
up = UpSet(auto_upset_3, show_counts=True, subset_size='count', sort_by='degree', orientation="vertical", min_degree=2)
up.plot()

Autophagy_path = os.path.join(graphs_files_stats, f"{directory}_Autophagy_Upset.svg")
plt.savefig(Autophagy_path, format='svg', bbox_inches='tight', dpi=1000)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:

Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning:

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

No description has been provided for this image
InĀ [162]:
HSR = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='B')
HSR_original = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='HSR')
Dox_only = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='HSR_only')
InĀ [163]:
# 96 genes from the HSR Doxorubicin bin 
InĀ [164]:
# Dox_main_all
# Main_DEGs
InĀ [165]:
HSR_all = HSR["NCBI"].dropna().astype(int).unique().tolist()
Dox_only_all = Dox_only["NCBI"].dropna().astype(int).unique().tolist()
HSR_original_all = HSR_original["NCBI"].dropna().astype(int).unique().tolist()
HSR_DEGs = HSR[
    ((HSR["FC_Doxorubicin"] >= 0.5) | (HSR["FC_Doxorubicin"] <= -0.5)) &
    (HSR["pval_Doxorubicin"] < 0.05)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [166]:
Total_HSR = set(HSR_all)
Total_original_HSR = set(HSR_original_all)

intersection = Total_original_HSR.intersection(Total_HSR)
only_Total_main = Total_original_HSR - intersection
only_Total_HSR = Total_HSR - Total_HSR

fig, ax = plt.subplots(figsize=(16, 16))
venn = venn2(subsets=(len(only_Total_main), len(only_Total_HSR), len(intersection)),
             set_labels=('Doxorubicin Screening', 'HSR Screening'),
             ax=ax)

HSR_all_graph_path = os.path.join(graphs_files_stats, f"{directory}_HSR_all_Venn.svg")
plt.savefig(HSR_all_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
No description has been provided for this image
InĀ [167]:
Dox_sets = set(Dox_only_all)

insersection_HSR = Total_HSR.intersection(Dox_sets)
only_Dox_all = Dox_sets - insersection_HSR
only_Dox = Dox_sets - Dox_sets

fig, ax = plt.subplots(figsize=(16, 16))
venn = venn2(subsets=(len(only_Dox_all), len(only_Dox), len(insersection_HSR)),
             set_labels=('Doxorubicin Screening', 'HSR Screening'),
             ax=ax)

HSR_Dox_graph_path = os.path.join(graphs_files_stats, f"{directory}_HSR_Dox_Venn.svg")
plt.savefig(HSR_Dox_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
No description has been provided for this image
InĀ [168]:
from gprofiler import GProfiler

# --- Step 1: Filter peripheral genes per drug (no exclusivity) ---
peripheral_GOBP = {}

for drug_name in drug_name_list:
    nan_col = f'nan_filter_{drug_name}'
    fc_col = f'FC_{drug_name}'
    pval_col = f'ovp3_{drug_name}'
    
    df_sub = Final_pval[
        (Final_pval[nan_col] == 1) &
        ((Final_pval[fc_col] >= 0.5) 
         | 
         (Final_pval[fc_col] <= -0.5)) &
        (Final_pval[pval_col] <= 0.05)
    ]
    
    genes_this_drug = set(df_sub['NCBI'])
    
    # Remove core genes if Core_Genes defined
    if 'Core_Genes' in globals():
        genes_this_drug.difference_update(set(Core_Genes))
    
    peripheral_GOBP[drug_name] = list(map(str, genes_this_drug))

print("Peripheral gene counts per drug:")
for drug, genes in peripheral_GOBP.items():
    print(f"{drug}: {len(genes)} genes")

# --- Step 2: Run GOBP enrichment for each drug ---
gp = GProfiler(return_dataframe=True)
gobp_results = {}

for drug, gene_list in peripheral_GOBP.items():
    if not gene_list:
        print(f"No genes for {drug}, skipping GOBP")
        continue
    
    print(f"Running GO:BP enrichment for {drug} ({len(gene_list)} genes)...")
    res = gp.profile(
        organism='hsapiens',
        query=gene_list,
        sources=['GO:BP'],
        user_threshold=0.05
    )
    gobp_results[drug] = res
    print(f"  Found {len(res)} enriched GO:BP terms for {drug}")

# --- Step 3: Combine GO results into one DataFrame ---
all_gobp = []
for drug, df in gobp_results.items():
    if df.empty:
        continue
    df = df.copy()
    df['drug'] = drug
    all_gobp.append(df)

combined_gobp = pd.concat(all_gobp, ignore_index=True)

combined_gobp = combined_gobp[['name', 'p_value', 'intersection_size', 'drug']]
combined_gobp.rename(columns={
    'name': 'GO_term',
    'p_value': 'p_adj',
    'intersection_size': 'Count'
}, inplace=True)
Peripheral gene counts per drug:
Paclitaxel: 1612 genes
Cisplatin: 1785 genes
TFT: 1625 genes
FdU: 1844 genes
EdU: 826 genes
Doxorubicin: 1983 genes
5FU: 1789 genes
Carboplatin: 1828 genes
Bleomycin: 1775 genes
Etoposide: 1516 genes
MitomycinC: 1461 genes
Carmustine: 1641 genes
Irinotecan: 1841 genes
6mercaptopurine: 755 genes
Vinblastine: 1329 genes
TAS102: 2046 genes
Running GO:BP enrichment for Paclitaxel (1612 genes)...
  Found 36 enriched GO:BP terms for Paclitaxel
Running GO:BP enrichment for Cisplatin (1785 genes)...
  Found 90 enriched GO:BP terms for Cisplatin
Running GO:BP enrichment for TFT (1625 genes)...
  Found 59 enriched GO:BP terms for TFT
Running GO:BP enrichment for FdU (1844 genes)...
  Found 26 enriched GO:BP terms for FdU
Running GO:BP enrichment for EdU (826 genes)...
  Found 1 enriched GO:BP terms for EdU
Running GO:BP enrichment for Doxorubicin (1983 genes)...
  Found 69 enriched GO:BP terms for Doxorubicin
Running GO:BP enrichment for 5FU (1789 genes)...
  Found 63 enriched GO:BP terms for 5FU
Running GO:BP enrichment for Carboplatin (1828 genes)...
  Found 62 enriched GO:BP terms for Carboplatin
Running GO:BP enrichment for Bleomycin (1775 genes)...
  Found 72 enriched GO:BP terms for Bleomycin
Running GO:BP enrichment for Etoposide (1516 genes)...
  Found 40 enriched GO:BP terms for Etoposide
Running GO:BP enrichment for MitomycinC (1461 genes)...
  Found 53 enriched GO:BP terms for MitomycinC
Running GO:BP enrichment for Carmustine (1641 genes)...
  Found 50 enriched GO:BP terms for Carmustine
Running GO:BP enrichment for Irinotecan (1841 genes)...
  Found 46 enriched GO:BP terms for Irinotecan
Running GO:BP enrichment for 6mercaptopurine (755 genes)...
  Found 15 enriched GO:BP terms for 6mercaptopurine
Running GO:BP enrichment for Vinblastine (1329 genes)...
  Found 17 enriched GO:BP terms for Vinblastine
Running GO:BP enrichment for TAS102 (2046 genes)...
  Found 206 enriched GO:BP terms for TAS102
InĀ [169]:
# Build mapping from Entrez ID (NCBI) to Gene Symbol
id2symbol = (
    Final_pval[['NCBI', 'Gene_Symbol']]
    .dropna()
    .drop_duplicates('NCBI')
    .set_index('NCBI')['Gene_Symbol']
    .to_dict()
)

peripheral_GSEApy = {}
core = set(Core_Genes) if 'Core_Genes' in globals() else set()

for drug_name in drug_name_list:
    ids = peripheral_dfs_select.get(drug_name, [])
    # Convert IDs to symbols, skipping unmapped ones
    symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
    # Remove None values and strip whitespace
    symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
    # Remove core genes if needed
    symbols = sorted(set(symbols) - core)
    peripheral_GSEApy[drug_name] = symbols

# Optional sanity check
for d, genes in peripheral_GSEApy.items():
    print(f"{d}: {len(genes)} mapped gene symbols")
Paclitaxel: 214 mapped gene symbols
Cisplatin: 257 mapped gene symbols
TFT: 192 mapped gene symbols
FdU: 258 mapped gene symbols
EdU: 174 mapped gene symbols
Doxorubicin: 270 mapped gene symbols
5FU: 198 mapped gene symbols
Carboplatin: 301 mapped gene symbols
Bleomycin: 236 mapped gene symbols
Etoposide: 270 mapped gene symbols
MitomycinC: 205 mapped gene symbols
Carmustine: 215 mapped gene symbols
Irinotecan: 324 mapped gene symbols
6mercaptopurine: 167 mapped gene symbols
Vinblastine: 218 mapped gene symbols
TAS102: 333 mapped gene symbols
InĀ [879]:
# Build categorical_GSEApy from category_dfs_select
categorical_GSEApy = {}

for category_name in category_dfs_select.keys():
    ids = category_dfs_select.get(category_name, [])
    # Convert IDs to symbols, skipping unmapped ones
    symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
    # Remove None values and strip whitespace
    symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
    # Remove core genes if needed
    symbols = sorted(set(symbols) - core)
    categorical_GSEApy[category_name] = symbols

# Optional sanity check
for c, genes in categorical_GSEApy.items():
    print(f"{c}: {len(genes)} mapped gene symbols")
Antimetabolite: 231 mapped gene symbols
DNA cross linking agent: 247 mapped gene symbols
DNA strand break agent: 342 mapped gene symbols
Microtubule inhibitor: 252 mapped gene symbols
InĀ [450]:
silenced_genes = set(Final_pval_df.loc[Final_pval_df["Silencing"] == "Silenced", "NCBI"])
InĀ [Ā ]:
import gseapy as gp

# Input data should be Gene Symbols
peripheral_GSEApy = {}

for drug_name in drug_name_list:
    nan_col = f'nan_filter_{drug_name}'
    fc_col = f'FC_{drug_name}'
    pval_col = f'ovp3_{drug_name}'
    
    df_sub = Final_pval[
        (Final_pval[nan_col] == 1) &
        ((Final_pval[fc_col] >= 0.5) 
         | 
         (Final_pval[fc_col] <= -0.5)
         ) 
        &
        (Final_pval[pval_col] <= 0.05)
    ]
    
    genes_this_drug = set(df_sub['Gene_Symbol'])  # make sure you use gene symbols

    if 'Core_Genes' in globals():
        genes_this_drug.difference_update(set(Core_Genes))

    peripheral_GSEApy[drug_name] = list(genes_this_drug)

gsea_results = {}

for drug, gene_list in peripheral_GSEApy.items():
    if not gene_list:
        print(f"No genes for {drug}, skipping GO:BP enrichment")
        continue

    print(f"Running Enrichr GO:BP enrichment for {drug} ({len(gene_list)} genes)...")

    try:
        enr = gp.enrichr(
            gene_list=gene_list,
            # gene_sets="KEGG_2021_Human",  # Use the latest GO:BP database
            gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'BioPlanet_2019', 'GO_Molecular_Function_2025'], 
            # gene_sets= ["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human"], 
            organism="Human",
            outdir=None,  # don’t write to disk
            cutoff=0.05
        )
        df_res = enr.results
        df_res['drug'] = drug
        gsea_results[drug] = df_res
        print(f" Found {len(df_res)} enriched terms for {drug}")

    except Exception as e:
        print(f" Error processing {drug}: {e}")

all_gsea = [df for df in gsea_results.values() if not df.empty]
if all_gsea:
    combined_gsea = pd.concat(all_gsea, ignore_index=True)
    print(f"Total enriched terms across drugs: {combined_gsea['Term'].nunique()}")
else:
    combined_gsea = pd.DataFrame()
    print("No enrichment terms found.")

combined_gsea.rename(columns={
    'Term': 'GO_term',
    'Adjusted P-value': 'p_adj',
    'Overlap': 'Overlap',
    'Genes': 'Genes'
}, inplace=True)
InĀ [608]:
# Helper function: convert a set of Entrez IDs to unique, sorted gene symbols
def ids_to_symbols(id_set, id2symbol):
    symbols = [id2symbol.get(int(i)) for i in id_set if int(i) in id2symbol]
    symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
    return sorted(set(symbols))

# Convert each gene set
core_symbols = ids_to_symbols(Core_Genes, id2symbol)
multidrug_symbols = ids_to_symbols(multidrug_genes, id2symbol)
nonresp_symbols = ids_to_symbols(non_respondent_genes, id2symbol)
silenced_symbols = ids_to_symbols(silenced_genes, id2symbol)

# Put them into other_GSEApy
other_GSEApy = {
    "Core": core_symbols,
    "Multidrug": multidrug_symbols,
    "Non-Respondent": nonresp_symbols,
    "Silenced ORFs": silenced_symbols,
}

# Optional sanity check
for k, v in other_GSEApy.items():
    print(f"{k}: {len(v)} mapped gene symbols")
Core: 197 mapped gene symbols
Multidrug: 6556 mapped gene symbols
Non-Respondent: 2849 mapped gene symbols
Silenced ORFs: 1908 mapped gene symbols
InĀ [651]:
import gseapy as gp

# --- Step 1: Filter peripheral genes per drug (no exclusivity) ---
peripheral_GSEApy = {}

for drug_name in drug_name_list:
    nan_col = f'nan_filter_{drug_name}'
    fc_col = f'FC_{drug_name}'
    pval_col = f'ovp3_{drug_name}'
    
    df_sub = Final_pval[
        (Final_pval[nan_col] == 1) &
        ((Final_pval[fc_col] >= 0.5) 
         | 
         (Final_pval[fc_col] <= -0.5)
         ) 
        &
        (Final_pval[pval_col] <= 0.05)
    ]
    
    genes_this_drug = set(df_sub['Gene_Symbol'])  # make sure you use gene symbols

    if 'Core_Genes' in globals():
        genes_this_drug.difference_update(set(Core_Genes))

    peripheral_GSEApy[drug_name] = list(genes_this_drug)


# peripheral_GSEApy = {}
# core = set(Core_Genes) if 'Core_Genes' in globals() else set()

# for drug_name in drug_name_list:
#     ids = peripheral_dfs_select.get(drug_name, [])
#     # Convert IDs to symbols, skipping unmapped ones
#     symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
#     # Remove None values and strip whitespace
#     symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
#     # Remove core genes if needed
#     symbols = sorted(set(symbols) - core)
#     peripheral_GSEApy[drug_name] = symbols

# print("Peripheral gene counts per drug:")
# for drug, genes in peripheral_GSEApy.items():
#     print(f"{drug}: {len(genes)} genes")

# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
gsea_results = {}

for drug, gene_list in peripheral_GSEApy.items():
    if not gene_list:
        print(f"No genes for {drug}, skipping GO:BP enrichment")
        continue

    print(f"Running Enrichr GO:BP enrichment for {drug} ({len(gene_list)} genes)...")

    try:
        enr = gp.enrichr(
            gene_list=gene_list,
            # gene_sets="KEGG_2021_Human",  # Use the latest GO:BP database
            gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'BioPlanet_2019', 'GO_Molecular_Function_2025'], 
            # gene_sets= ["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human"], 
            organism="Human",
            outdir=None,  # don’t write to disk
            cutoff=0.05
        )
        df_res = enr.results
        df_res['drug'] = drug
        gsea_results[drug] = df_res
        print(f" Found {len(df_res)} enriched terms for {drug}")

    except Exception as e:
        print(f" Error processing {drug}: {e}")

# --- Step 3: Combine results ---
all_gsea = [df for df in gsea_results.values() if not df.empty]
if all_gsea:
    combined_gsea = pd.concat(all_gsea, ignore_index=True)
    print(f"Total enriched terms across drugs: {combined_gsea['Term'].nunique()}")
else:
    combined_gsea = pd.DataFrame()
    print("No enrichment terms found.")

# --- Optional: Rename columns for consistency ---
combined_gsea.rename(columns={
    'Term': 'GO_term',
    'Adjusted P-value': 'p_adj',
    'Overlap': 'Overlap',
    'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Paclitaxel (1701 genes)...
 Found 8573 enriched terms for Paclitaxel
Running Enrichr GO:BP enrichment for Cisplatin (1907 genes)...
 Found 8692 enriched terms for Cisplatin
Running Enrichr GO:BP enrichment for TFT (1751 genes)...
 Found 8651 enriched terms for TFT
Running Enrichr GO:BP enrichment for FdU (1969 genes)...
 Found 8790 enriched terms for FdU
Running Enrichr GO:BP enrichment for EdU (866 genes)...
 Found 6422 enriched terms for EdU
Running Enrichr GO:BP enrichment for Doxorubicin (2099 genes)...
 Found 9185 enriched terms for Doxorubicin
Running Enrichr GO:BP enrichment for 5FU (1900 genes)...
 Found 8820 enriched terms for 5FU
Running Enrichr GO:BP enrichment for Carboplatin (1957 genes)...
 Found 8820 enriched terms for Carboplatin
Running Enrichr GO:BP enrichment for Bleomycin (1877 genes)...
 Found 8779 enriched terms for Bleomycin
Running Enrichr GO:BP enrichment for Etoposide (1594 genes)...
 Found 8178 enriched terms for Etoposide
Running Enrichr GO:BP enrichment for MitomycinC (1541 genes)...
 Found 8324 enriched terms for MitomycinC
Running Enrichr GO:BP enrichment for Carmustine (1751 genes)...
 Found 8535 enriched terms for Carmustine
Running Enrichr GO:BP enrichment for Irinotecan (1952 genes)...
 Found 8614 enriched terms for Irinotecan
Running Enrichr GO:BP enrichment for 6mercaptopurine (785 genes)...
 Found 6218 enriched terms for 6mercaptopurine
Running Enrichr GO:BP enrichment for Vinblastine (1390 genes)...
 Found 7614 enriched terms for Vinblastine
Running Enrichr GO:BP enrichment for TAS102 (2182 genes)...
 Found 9316 enriched terms for TAS102
Total enriched terms across drugs: 11055
InĀ [820]:
other_gsea_results = {}

# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
for category, gene_list in other_GSEApy.items():
    if not gene_list:
        print(f"No genes for {category}, skipping GO:BP enrichment")
        continue

    print(f"Running Enrichr GO:BP enrichment for {category} ({len(gene_list)} genes)...")

    try:
        enr = gp.enrichr(
            gene_list=gene_list,
            # you can include multiple libraries here
            gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'GO_Molecular_Function_2025'], 
            # gene_sets="KEGG_2021_Human", 
            organism="Human",
            outdir=None,
            cutoff=0.05
        )
        df_res = enr.results
        df_res['category'] = category
        other_gsea_results[category] = df_res
        print(f" Found {len(df_res)} enriched terms for {category}")

    except Exception as e:
        print(f" Error processing {category}: {e}")

# --- Step 3: Combine results ---
all_other_gsea = [df for df in other_gsea_results.values() if not df.empty]
if all_other_gsea:
    combined_other_gsea = pd.concat(all_other_gsea, ignore_index=True)
    print(f"Total enriched terms across categories: {combined_other_gsea['Term'].nunique()}")
else:
    combined_other_gsea = pd.DataFrame()
    print("No enrichment terms found for other_GSEApy.")

# --- Optional: Rename columns for consistency ---
combined_other_gsea.rename(columns={
    'Term': 'GO_term',
    'Adjusted P-value': 'p_adj',
    'Overlap': 'Overlap',
    'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Core (197 genes)...
 Found 1897 enriched terms for Core
Running Enrichr GO:BP enrichment for Multidrug (6556 genes)...
 Found 9516 enriched terms for Multidrug
Running Enrichr GO:BP enrichment for Non-Respondent (2849 genes)...
 Found 8053 enriched terms for Non-Respondent
Running Enrichr GO:BP enrichment for Silenced ORFs (1908 genes)...
 Found 7590 enriched terms for Silenced ORFs
Total enriched terms across categories: 9703
InĀ [881]:
categorical_gsea_results = {}

# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
for category, gene_list in categorical_GSEApy.items():
    if not gene_list:
        print(f"No genes for {category}, skipping GO:BP enrichment")
        continue

    print(f"Running Enrichr GO:BP enrichment for {category} ({len(gene_list)} genes)...")

    try:
        enr = gp.enrichr(
            gene_list=gene_list,
            gene_sets=[
                "KEGG_2021_Human",
                "Reactome_Pathways_2024",
                "WikiPathways_2024_Human",
                "GO_Biological_Process_2025",
                "GO_Molecular_Function_2025"
            ],
            organism="Human",
            outdir=None,
            cutoff=0.05
        )
        df_res = enr.results
        df_res['category'] = category
        categorical_gsea_results[category] = df_res
        print(f" Found {len(df_res)} enriched terms for {category}")

    except Exception as e:
        print(f" Error processing {category}: {e}")

# --- Step 3: Combine results ---
all_categorical_gsea = [df for df in categorical_gsea_results.values() if not df.empty]
if all_categorical_gsea:
    combined_categorical_gsea = pd.concat(all_categorical_gsea, ignore_index=True)
    print(f"Total enriched terms across categories: {combined_categorical_gsea['Term'].nunique()}")
else:
    combined_categorical_gsea = pd.DataFrame()
    print("No enrichment terms found for categorical_GSEApy.")

# --- Optional: Rename columns for consistency ---
combined_categorical_gsea.rename(columns={
    'Term': 'GO_term',
    'Adjusted P-value': 'p_adj',
    'Overlap': 'Overlap',
    'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Antimetabolite (231 genes)...
 Found 2693 enriched terms for Antimetabolite
Running Enrichr GO:BP enrichment for DNA cross linking agent (247 genes)...
 Found 2937 enriched terms for DNA cross linking agent
Running Enrichr GO:BP enrichment for DNA strand break agent (342 genes)...
 Found 3406 enriched terms for DNA strand break agent
Running Enrichr GO:BP enrichment for Microtubule inhibitor (252 genes)...
 Found 2551 enriched terms for Microtubule inhibitor
Total enriched terms across categories: 6001
InĀ [1018]:
combined_categorical_gsea
Out[1018]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
0 KEGG_2021_Human Phenylalanine metabolism 3/17 0.000918 0.190844 0 0 18.566729 129.852632 ALDH3B2;AOC2;HPD Antimetabolite 3
1 KEGG_2021_Human Tyrosine metabolism 3/36 0.008201 0.498437 0 0 7.869219 37.800188 ALDH3B2;AOC2;HPD Antimetabolite 3
2 KEGG_2021_Human Pathways of neurodegeneration 12/475 0.009466 0.498437 0 0 2.284801 10.647343 FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... Antimetabolite 12
3 KEGG_2021_Human Various types of N-glycan biosynthesis 3/39 0.010235 0.498437 0 0 7.212354 33.046912 MAN2A2;HEXB;STT3B Antimetabolite 3
4 KEGG_2021_Human Protein processing in endoplasmic reticulum 6/171 0.014637 0.498437 0 0 3.168323 13.383670 ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 Antimetabolite 6
... ... ... ... ... ... ... ... ... ... ... ... ...
11582 GO_Molecular_Function_2025 Tubulin Binding (GO:0015631) 2/326 0.919016 0.934593 0 0 0.479605 0.040503 GAS8;DLGAP5 Microtubule inhibitor 2
11583 GO_Molecular_Function_2025 Protein Heterodimerization Activity (GO:0046982) 1/216 0.936314 0.948166 0 0 0.361957 0.023818 PPP2CA Microtubule inhibitor 1
11584 GO_Molecular_Function_2025 Cadherin Binding (GO:0045296) 1/317 0.982608 0.990865 0 0 0.244995 0.004298 CAPZB Microtubule inhibitor 1
11585 GO_Molecular_Function_2025 Olfactory Receptor Activity (GO:0004984) 1/379 0.992185 0.994195 0 0 0.204157 0.001602 OR2J3 Microtubule inhibitor 1
11586 GO_Molecular_Function_2025 Anion Binding (GO:0043168) 1/402 0.994195 0.994195 0 0 0.192219 0.001119 GSR Microtubule inhibitor 1

11587 rows Ɨ 12 columns

InĀ [Ā ]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_categorical_gsea['gene_count'] = combined_categorical_gsea['Genes'].str.count(';') + 1

# Keep only rows with at least 3 genes
filtered_cetegorical_gsea = combined_categorical_gsea[combined_categorical_gsea['gene_count'] >= 3].copy()

# # Optionally drop the temporary 'gene_count' column if no longer needed
filtered_categorical_gsea = filtered_cetegorical_gsea[filtered_cetegorical_gsea['P-value'] <= 0.05].copy()
InĀ [890]:
filtered_categorical_gsea
Out[890]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
0 KEGG_2021_Human Phenylalanine metabolism 3/17 0.000918 0.190844 0 0 18.566729 129.852632 ALDH3B2;AOC2;HPD Antimetabolite 3
1 KEGG_2021_Human Tyrosine metabolism 3/36 0.008201 0.498437 0 0 7.869219 37.800188 ALDH3B2;AOC2;HPD Antimetabolite 3
2 KEGG_2021_Human Pathways of neurodegeneration 12/475 0.009466 0.498437 0 0 2.284801 10.647343 FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... Antimetabolite 12
3 KEGG_2021_Human Various types of N-glycan biosynthesis 3/39 0.010235 0.498437 0 0 7.212354 33.046912 MAN2A2;HEXB;STT3B Antimetabolite 3
4 KEGG_2021_Human Protein processing in endoplasmic reticulum 6/171 0.014637 0.498437 0 0 3.168323 13.383670 ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 Antimetabolite 6
... ... ... ... ... ... ... ... ... ... ... ... ...
11355 GO_Molecular_Function_2025 Sequence-Specific Double-Stranded DNA Binding ... 15/706 0.034477 0.507452 0 0 1.745498 5.877905 OLIG3;CEBPE;RORC;DUX4;POU3F4;ELK4;SFPQ;IRF4;ZN... Microtubule inhibitor 15
11356 GO_Molecular_Function_2025 RNA Polymerase II Transcription Regulatory Reg... 23/1236 0.039813 0.507452 0 0 1.534702 4.947199 ZNF100;CEBPE;ZBTB16;RORC;ZNF3;ZNF25;DUX4;POU3F... Microtubule inhibitor 23
11357 GO_Molecular_Function_2025 Serine-Type Peptidase Activity (GO:0008236) 5/149 0.040583 0.507452 0 0 2.755848 8.830882 CLPP;HTRA4;PLAT;TTF2;PRSS12 Microtubule inhibitor 5
11358 GO_Molecular_Function_2025 RNA Polymerase II Cis-Regulatory Region Sequen... 20/1054 0.045036 0.507452 0 0 1.560228 4.837152 ZNF100;OLIG3;CEBPE;ZBTB16;RORC;DUX4;POU3F4;ELK... Microtubule inhibitor 20
11359 GO_Molecular_Function_2025 Guanyl-Nucleotide Exchange Factor Activity (GO... 6/206 0.046696 0.507452 0 0 2.383902 7.304500 PLEKHG3;PREX1;RABGEF1;ELMO1;ARHGEF4;FGD2 Microtubule inhibitor 6

333 rows Ɨ 12 columns

InĀ [887]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_other_gsea['gene_count'] = combined_other_gsea['Genes'].str.count(';') + 1

# Keep only rows with at least 3 genes
filtered_other_gsea = combined_other_gsea[combined_other_gsea['gene_count'] >= 3].copy()

# # Optionally drop the temporary 'gene_count' column if no longer needed
# filtered_gsea.drop(columns=['gene_count'], inplace=True)
filtered_other_gsea = filtered_other_gsea[filtered_other_gsea['P-value'] <= 0.05].copy()
InĀ [973]:
filtered_other_gsea 
Out[973]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
0 KEGG_2021_Human Human T-cell leukemia virus 1 infection 9/219 0.000333 0.046568 0 0 4.466489 35.769786 FOSL1;EGR1;MSX2;MYC;E2F1;TBPL1;SLC25A31;MSX1;C... Core 9
1 KEGG_2021_Human Maturity onset diabetes of the young 3/26 0.002071 0.144949 0 0 13.298969 82.185848 HNF4A;GCK;FOXA3 Core 3
2 KEGG_2021_Human Olfactory transduction 11/440 0.004375 0.204187 0 0 2.670802 14.507133 OR5M11;OR2C3;OR51E1;OR2AE1;OR2G3;OR6K2;OR1F1;O... Core 11
3 KEGG_2021_Human Chemical carcinogenesis 7/239 0.009653 0.250869 0 0 3.107917 14.422219 FGF8;KLF5;MYC;MGST3;E2F1;FGF20;CREB5 Core 7
4 KEGG_2021_Human Transcriptional misregulation in cancer 6/192 0.012044 0.250869 0 0 3.313123 14.641432 HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 Core 6
... ... ... ... ... ... ... ... ... ... ... ... ...
26308 GO_Molecular_Function_2025 Flavin Adenine Dinucleotide Binding (GO:0050660) 10/58 0.046237 0.539071 0 0 1.980594 6.088315 GCDH;AIFM1;MAOB;KDM1B;TXNRD1;ACOX3;KMO;SDHA;MT... Silenced ORFs 10
26309 GO_Molecular_Function_2025 Benzodiazepine Receptor Activity (GO:0008503) 3/9 0.046962 0.539071 0 0 4.746982 14.518244 GABRA6;GABRE;GABRG3 Silenced ORFs 3
26310 GO_Molecular_Function_2025 Tubulin-Glutamic Acid Ligase Activity (GO:0070... 3/9 0.046962 0.539071 0 0 4.746982 14.518244 TTLL6;TTLL10;TTLL2 Silenced ORFs 3
26311 GO_Molecular_Function_2025 Type II Transforming Growth Factor Beta Recept... 3/9 0.046962 0.539071 0 0 4.746982 14.518244 TGFB2;TGFB3;TGFBR1 Silenced ORFs 3
26312 GO_Molecular_Function_2025 Phospholipase Activator Activity (GO:0016004) 4/15 0.047930 0.541468 0 0 3.453209 10.490882 PDGFRB;BTK;FYN;ARHGAP6 Silenced ORFs 4

1536 rows Ɨ 12 columns

InĀ [892]:
# Combine row-wise
combined_filtered_gsea = pd.concat(
    [filtered_categorical_gsea, filtered_other_gsea],
    ignore_index=True
)
InĀ [894]:
import re

# Remove GO IDs in parentheses and trailing WP IDs
def clean_term(term):
    term = re.sub(r"\s*\(GO:\d+\)$", "", term)        # Remove GO IDs in parentheses
    term = re.sub(r"\s+WP\d+$", "", term)             # Remove trailing WP IDs
    return term.strip()

# filtered_other_gsea["GO_term"] = filtered_other_gsea["GO_term"].apply(clean_term)
combined_filtered_gsea["GO_term"] = combined_filtered_gsea["GO_term"].apply(clean_term)
InĀ [1019]:
combined_filtered_gsea
Out[1019]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
0 KEGG_2021_Human Phenylalanine metabolism 3/17 0.000918 0.190844 0 0 18.566729 129.852632 ALDH3B2;AOC2;HPD Antimetabolite 3
1 KEGG_2021_Human Tyrosine metabolism 3/36 0.008201 0.498437 0 0 7.869219 37.800188 ALDH3B2;AOC2;HPD Antimetabolite 3
2 KEGG_2021_Human Pathways of neurodegeneration 12/475 0.009466 0.498437 0 0 2.284801 10.647343 FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... Antimetabolite 12
3 KEGG_2021_Human Various types of N-glycan biosynthesis 3/39 0.010235 0.498437 0 0 7.212354 33.046912 MAN2A2;HEXB;STT3B Antimetabolite 3
4 KEGG_2021_Human Protein processing in endoplasmic reticulum 6/171 0.014637 0.498437 0 0 3.168323 13.383670 ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 Antimetabolite 6
... ... ... ... ... ... ... ... ... ... ... ... ...
1864 GO_Molecular_Function_2025 Flavin Adenine Dinucleotide Binding 10/58 0.046237 0.539071 0 0 1.980594 6.088315 GCDH;AIFM1;MAOB;KDM1B;TXNRD1;ACOX3;KMO;SDHA;MT... Silenced ORFs 10
1865 GO_Molecular_Function_2025 Benzodiazepine Receptor Activity 3/9 0.046962 0.539071 0 0 4.746982 14.518244 GABRA6;GABRE;GABRG3 Silenced ORFs 3
1866 GO_Molecular_Function_2025 Tubulin-Glutamic Acid Ligase Activity 3/9 0.046962 0.539071 0 0 4.746982 14.518244 TTLL6;TTLL10;TTLL2 Silenced ORFs 3
1867 GO_Molecular_Function_2025 Type II Transforming Growth Factor Beta Recept... 3/9 0.046962 0.539071 0 0 4.746982 14.518244 TGFB2;TGFB3;TGFBR1 Silenced ORFs 3
1868 GO_Molecular_Function_2025 Phospholipase Activator Activity 4/15 0.047930 0.541468 0 0 3.453209 10.490882 PDGFRB;BTK;FYN;ARHGAP6 Silenced ORFs 4

1869 rows Ɨ 12 columns

InĀ [1006]:
# Define keywords of interest
keywords = ["Transcriptional misregulation in cancer", 
            "TGF-beta signaling pathway", 
            "SNARE interactions in vesicular transport", 
            "p53 signaling pathway", "Phagosome", "Ferroptosis", 
            "Biosynthesis of unsaturated fatty acids", 
            "Tight junction",
            "AMPK signaling pathway", "PPAR signaling pathway",
            # "RNA Polymerase II Transcription Termination",
            "Regulation of DNA Damage Response, Signal Transduction by p53 Class Mediator",
            "Positive Regulation of Cell Cycle",
            "Wnt Signaling Pathway",
            "Pyroptosis",
            "PI3K AKT Signaling in Cancer",
            "PI3K-Akt signaling pathway",
            "Toll-like Receptor Signaling Pathway",
            # "Regulation of ERK1 and ERK2 Cascade",
            "Positive regulation of Autophagy",
            # "Cytochrome P450 pathway",
            # "MAPK signaling pathway",
            "MAP Kinase Activation",
            "CREB phosphorylation",
            "Protein kinase Binding",
            # "Regulation of Autophagy",
            "Hippo signaling pathway",
            "Activation of BH3-only proteins",
            "Intrinsic Pathway for Apoptosis",
            "Regulation of Double-Strand Break Repair",
            "Positive Regulation of Cell Cycle",
            "Calcium signaling Pathway",
            "GnRH signaling pathway",
            "Toll-like receptor signaling pathway",
            "RAF activation",
            "Purine Ribonulcleotide Biosynthetic Process",
            "Regulated Necrosis"
            
            
]

# # Filter terms that contain any of the keywords (case insensitive)
# relevant_gsea = filtered_other_gsea[
#     filtered_other_gsea["GO_term"].str.contains("|".join(keywords), case=False, na=False)
# ]

# relevant_gsea = filtered_other_gsea[
#     filtered_other_gsea["GO_term"].isin(keywords)
# ]

relevant_gsea = combined_filtered_gsea[
    combined_filtered_gsea["GO_term"].isin(keywords)
]
InĀ [1008]:
relevant_gsea 
Out[1008]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
6 KEGG_2021_Human PI3K-Akt signaling pathway 9/354 0.022288 0.498437 0 0 2.282491 8.681976 COL2A1;EFNA2;LPAR6;PKN2;ITGA6;SOS2;YWHAZ;TLR2;... Antimetabolite 9
65 KEGG_2021_Human Hippo signaling pathway 8/163 0.000971 0.095598 0 0 4.232258 29.361982 LATS1;BMP2;DLG3;YWHAQ;YWHAB;TP53BP2;WNT9B;ACTG1 DNA cross linking agent 8
69 KEGG_2021_Human PI3K-Akt signaling pathway 9/354 0.032351 0.653581 0 0 2.127293 7.299007 MAP2K1;EFNA3;YWHAQ;YWHAB;ITGA2;F2R;PRKCA;ITGA5... DNA cross linking agent 9
100 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 3/55 0.030494 0.523092 0 0 4.658181 16.258082 YWHAQ;YWHAB;TP53BP2 DNA cross linking agent 3
128 GO_Biological_Process_2025 Regulation of Double-Strand Break Repair 6/91 0.000927 0.258265 0 0 5.760703 40.228089 ING3;KDM1A;SPIRE2;DMAP1;FOXM1;BRD7 DNA cross linking agent 6
145 GO_Biological_Process_2025 Positive Regulation of Cell Cycle 4/60 0.006445 0.378306 0 0 5.789830 29.206633 FOXA1;TAL1;PRKCA;AURKA DNA cross linking agent 4
208 KEGG_2021_Human GnRH signaling pathway 5/93 0.021734 0.668100 0 0 3.299501 12.633427 CAMK2D;JMJD7-PLA2G4B;GNAQ;GNRH2;PRKACB DNA strand break agent 5
212 KEGG_2021_Human Toll-like receptor signaling pathway 5/104 0.033121 0.668100 0 0 2.931241 9.988457 CD40;IFNA7;CCL4;MAP3K8;FADD DNA strand break agent 5
273 KEGG_2021_Human TGF-beta signaling pathway 5/94 0.006747 0.950090 0 0 4.471410 22.350815 PPP2CA;E2F4;BMPR1B;RHOA;BMP5 Microtubule inhibitor 5
276 KEGG_2021_Human Transcriptional misregulation in cancer 6/192 0.035084 0.950090 0 0 2.565172 8.593333 ELK4;CEBPE;ZBTB16;TSPAN7;PLAT;KDM6A Microtubule inhibitor 6
337 KEGG_2021_Human Transcriptional misregulation in cancer 6/192 0.012044 0.250869 0 0 3.313123 14.641432 HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 Core 6
339 KEGG_2021_Human TGF-beta signaling pathway 4/94 0.013992 0.250869 0 0 4.539551 19.380511 TFDP1;MYC;PITX2;RGMA Core 4
479 GO_Biological_Process_2025 Positive Regulation of Cell Cycle 3/60 0.021360 0.191283 0 0 5.357027 20.604474 TCF7L1;MSX1;MEIS2 Core 3
489 GO_Biological_Process_2025 Wnt Signaling Pathway 3/83 0.048751 0.291727 0 0 3.812436 11.517512 TCF7L1;SNAI1;GATA3 Core 3
507 KEGG_2021_Human SNARE interactions in vesicular transport 19/33 0.002895 0.307884 0 0 2.788195 16.295910 STX17;GOSR2;GOSR1;SNAP23;STX18;STX7;STX1B;STX6... Multidrug 19
508 KEGG_2021_Human p53 signaling pathway 35/73 0.004970 0.317120 0 0 1.893518 10.043929 CDKN1A;CD82;EI24;PPM1D;RCHY1;BBC3;CCND3;CASP8;... Multidrug 35
514 KEGG_2021_Human TGF-beta signaling pathway 41/94 0.017958 0.572849 0 0 1.590035 6.391524 BMPR2;AMHR2;ACVR1B;LTBP1;PPP2CB;ACVR1C;PPP2R1A... Multidrug 41
523 KEGG_2021_Human Phagosome 60/152 0.048190 0.737977 0 0 1.340490 4.065178 ITGAM;TFRC;ITGB5;NCF2;ITGB2;TCIRG1;MPO;CTSS;FC... Multidrug 60
525 KEGG_2021_Human Ferroptosis 19/41 0.048582 0.737977 0 0 1.773249 5.363212 PRNP;TFRC;GPX4;ALOX15;SLC40A1;ACSL6;ACSL5;CYBB... Multidrug 19
539 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 27/55 0.008696 0.724412 0 0 1.981445 9.401659 AVEN;DIABLO;APIP;UACA;BBC3;CASP8;AKT2;C1QBP;AK... Multidrug 27
548 Reactome_Pathways_2024 Pyroptosis 15/27 0.012153 0.724412 0 0 2.566886 11.320440 GSDMD;HMGB1;IL1B;IRF1;CASP4;CHMP2B;IRF2;CASP1;... Multidrug 15
569 Reactome_Pathways_2024 PI3K AKT Signaling in Cancer 47/110 0.018145 0.724412 0 0 1.533669 6.149036 CD86;CDKN1A;GSK3A;IRS1;SRC;CD80;MAPKAP1;PDGFB;... Multidrug 47
580 Reactome_Pathways_2024 Regulated Necrosis 28/61 0.022122 0.724412 0 0 1.743111 6.643335 HMGB1;SDCBP;CASP8;UBB;CASP4;CASP1;RIPK1;BAK1;R... Multidrug 28
888 GO_Biological_Process_2025 Wnt Signaling Pathway 35/83 0.045723 0.999980 0 0 1.497917 4.621308 LEF1;TCF7;DIXDC1;WNT8A;KLHL12;WNT6;PORCN;FRAT2... Multidrug 35
950 KEGG_2021_Human Biosynthesis of unsaturated fatty acids 8/27 0.030155 0.999994 0 0 2.539061 8.890254 ACOT7;ELOVL5;ELOVL3;SCD5;ACOT2;ACOT1;ELOVL6;ACOX3 Non-Respondent 8
951 KEGG_2021_Human Tight junction 33/169 0.035270 0.999994 0 0 1.466136 4.903833 IGSF5;ARPC5L;PRKAG2;PARD6G;CD1D;F11R;CD1B;AMOT... Non-Respondent 33
1176 KEGG_2021_Human AMPK signaling pathway 24/120 0.000365 0.056282 0 0 2.388004 18.899482 PFKFB1;CPT1A;PFKFB3;TSC2;PIK3R3;PRKAG2;PPP2R5A... Silenced ORFs 24
1220 KEGG_2021_Human PPAR signaling pathway 12/74 0.046737 0.308869 0 0 1.840547 5.638008 CPT1A;SCD;ACSL6;ILK;PPARG;ACADM;CD36;HMGCS2;AC... Silenced ORFs 12
1272 Reactome_Pathways_2024 MAP Kinase Activation 13/63 0.005824 0.192571 0 0 2.475420 12.738038 ATF1;ATF2;MAP2K1;MEF2C;FBXW11;RIPK2;RPS6KA3;MA... Silenced ORFs 13
InĀ [1009]:
# Define the desired order
category_order = [
    "Core",
    "Antimetabolite",
    "DNA cross linking agent",
    "DNA strand break agent",
    "Microtubule inhibitor",
    "Multidrug",
    "Non-Respondent",
    "Silenced ORFs"
]

# Convert 'category' to categorical with specified order
relevant_gsea['category'] = pd.Categorical(
    relevant_gsea['category'],
    categories=category_order,
    ordered=True
)

# Sort by the ordered category
relevant_gsea = relevant_gsea.sort_values('category').reset_index(drop=True)
/tmp/ipykernel_1548459/2888785418.py:14: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

InĀ [1010]:
relevant_gsea 
Out[1010]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes category gene_count
0 GO_Biological_Process_2025 Wnt Signaling Pathway 3/83 0.048751 0.291727 0 0 3.812436 11.517512 TCF7L1;SNAI1;GATA3 Core 3
1 GO_Biological_Process_2025 Positive Regulation of Cell Cycle 3/60 0.021360 0.191283 0 0 5.357027 20.604474 TCF7L1;MSX1;MEIS2 Core 3
2 KEGG_2021_Human Transcriptional misregulation in cancer 6/192 0.012044 0.250869 0 0 3.313123 14.641432 HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 Core 6
3 KEGG_2021_Human TGF-beta signaling pathway 4/94 0.013992 0.250869 0 0 4.539551 19.380511 TFDP1;MYC;PITX2;RGMA Core 4
4 KEGG_2021_Human PI3K-Akt signaling pathway 9/354 0.022288 0.498437 0 0 2.282491 8.681976 COL2A1;EFNA2;LPAR6;PKN2;ITGA6;SOS2;YWHAZ;TLR2;... Antimetabolite 9
5 KEGG_2021_Human Hippo signaling pathway 8/163 0.000971 0.095598 0 0 4.232258 29.361982 LATS1;BMP2;DLG3;YWHAQ;YWHAB;TP53BP2;WNT9B;ACTG1 DNA cross linking agent 8
6 KEGG_2021_Human PI3K-Akt signaling pathway 9/354 0.032351 0.653581 0 0 2.127293 7.299007 MAP2K1;EFNA3;YWHAQ;YWHAB;ITGA2;F2R;PRKCA;ITGA5... DNA cross linking agent 9
7 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 3/55 0.030494 0.523092 0 0 4.658181 16.258082 YWHAQ;YWHAB;TP53BP2 DNA cross linking agent 3
8 GO_Biological_Process_2025 Regulation of Double-Strand Break Repair 6/91 0.000927 0.258265 0 0 5.760703 40.228089 ING3;KDM1A;SPIRE2;DMAP1;FOXM1;BRD7 DNA cross linking agent 6
9 GO_Biological_Process_2025 Positive Regulation of Cell Cycle 4/60 0.006445 0.378306 0 0 5.789830 29.206633 FOXA1;TAL1;PRKCA;AURKA DNA cross linking agent 4
10 KEGG_2021_Human GnRH signaling pathway 5/93 0.021734 0.668100 0 0 3.299501 12.633427 CAMK2D;JMJD7-PLA2G4B;GNAQ;GNRH2;PRKACB DNA strand break agent 5
11 KEGG_2021_Human Toll-like receptor signaling pathway 5/104 0.033121 0.668100 0 0 2.931241 9.988457 CD40;IFNA7;CCL4;MAP3K8;FADD DNA strand break agent 5
12 KEGG_2021_Human Transcriptional misregulation in cancer 6/192 0.035084 0.950090 0 0 2.565172 8.593333 ELK4;CEBPE;ZBTB16;TSPAN7;PLAT;KDM6A Microtubule inhibitor 6
13 KEGG_2021_Human TGF-beta signaling pathway 5/94 0.006747 0.950090 0 0 4.471410 22.350815 PPP2CA;E2F4;BMPR1B;RHOA;BMP5 Microtubule inhibitor 5
14 GO_Biological_Process_2025 Wnt Signaling Pathway 35/83 0.045723 0.999980 0 0 1.497917 4.621308 LEF1;TCF7;DIXDC1;WNT8A;KLHL12;WNT6;PORCN;FRAT2... Multidrug 35
15 Reactome_Pathways_2024 Regulated Necrosis 28/61 0.022122 0.724412 0 0 1.743111 6.643335 HMGB1;SDCBP;CASP8;UBB;CASP4;CASP1;RIPK1;BAK1;R... Multidrug 28
16 Reactome_Pathways_2024 PI3K AKT Signaling in Cancer 47/110 0.018145 0.724412 0 0 1.533669 6.149036 CD86;CDKN1A;GSK3A;IRS1;SRC;CD80;MAPKAP1;PDGFB;... Multidrug 47
17 Reactome_Pathways_2024 Pyroptosis 15/27 0.012153 0.724412 0 0 2.566886 11.320440 GSDMD;HMGB1;IL1B;IRF1;CASP4;CHMP2B;IRF2;CASP1;... Multidrug 15
18 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 27/55 0.008696 0.724412 0 0 1.981445 9.401659 AVEN;DIABLO;APIP;UACA;BBC3;CASP8;AKT2;C1QBP;AK... Multidrug 27
19 KEGG_2021_Human SNARE interactions in vesicular transport 19/33 0.002895 0.307884 0 0 2.788195 16.295910 STX17;GOSR2;GOSR1;SNAP23;STX18;STX7;STX1B;STX6... Multidrug 19
20 KEGG_2021_Human Phagosome 60/152 0.048190 0.737977 0 0 1.340490 4.065178 ITGAM;TFRC;ITGB5;NCF2;ITGB2;TCIRG1;MPO;CTSS;FC... Multidrug 60
21 KEGG_2021_Human TGF-beta signaling pathway 41/94 0.017958 0.572849 0 0 1.590035 6.391524 BMPR2;AMHR2;ACVR1B;LTBP1;PPP2CB;ACVR1C;PPP2R1A... Multidrug 41
22 KEGG_2021_Human p53 signaling pathway 35/73 0.004970 0.317120 0 0 1.893518 10.043929 CDKN1A;CD82;EI24;PPM1D;RCHY1;BBC3;CCND3;CASP8;... Multidrug 35
23 KEGG_2021_Human Ferroptosis 19/41 0.048582 0.737977 0 0 1.773249 5.363212 PRNP;TFRC;GPX4;ALOX15;SLC40A1;ACSL6;ACSL5;CYBB... Multidrug 19
24 KEGG_2021_Human Biosynthesis of unsaturated fatty acids 8/27 0.030155 0.999994 0 0 2.539061 8.890254 ACOT7;ELOVL5;ELOVL3;SCD5;ACOT2;ACOT1;ELOVL6;ACOX3 Non-Respondent 8
25 KEGG_2021_Human Tight junction 33/169 0.035270 0.999994 0 0 1.466136 4.903833 IGSF5;ARPC5L;PRKAG2;PARD6G;CD1D;F11R;CD1B;AMOT... Non-Respondent 33
26 KEGG_2021_Human PPAR signaling pathway 12/74 0.046737 0.308869 0 0 1.840547 5.638008 CPT1A;SCD;ACSL6;ILK;PPARG;ACADM;CD36;HMGCS2;AC... Silenced ORFs 12
27 KEGG_2021_Human AMPK signaling pathway 24/120 0.000365 0.056282 0 0 2.388004 18.899482 PFKFB1;CPT1A;PFKFB3;TSC2;PIK3R3;PRKAG2;PPP2R5A... Silenced ORFs 24
28 Reactome_Pathways_2024 MAP Kinase Activation 13/63 0.005824 0.192571 0 0 2.475420 12.738038 ATF1;ATF2;MAP2K1;MEF2C;FBXW11;RIPK2;RPS6KA3;MA... Silenced ORFs 13
InĀ [Ā ]:
from textwrap import wrap

import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Ensure 'Combined Score' is float
relevant_gsea['Combined Score'] = pd.to_numeric(relevant_gsea['Combined Score'], errors='coerce')
relevant_gsea = relevant_gsea.dropna(subset=['Combined Score'])
relevant_gsea['Odds Ratio'] = relevant_gsea['Odds Ratio'].round().astype(int)

# Add log10 P-value
relevant_gsea['log10_P-value'] = -np.log10(relevant_gsea['P-value'])

# -----------------------------
# Wrap y-axis labels if too long
# -----------------------------
median_len = int(relevant_gsea['GO_term'].str.len().median())
median_len = 30

def wrap_label(label, max_len=median_len):
    """Split GO terms into 2 lines if longer than median length."""
    if len(label) > max_len:
        return "\n".join(wrap(label, max_len))
    return label

relevant_gsea['GO_term_wrapped'] = relevant_gsea['GO_term'].apply(wrap_label)

# -----------------------------
# Count overlaps per GO term
# -----------------------------

# Add counts to labels (optional, can remove if not wanted)
go_counts = relevant_gsea.groupby('GO_term_wrapped')['category'].nunique().sort_values(ascending=True)

# Reorder categorical axis (descending by overlap)
relevant_gsea['GO_term_wrapped'] = pd.Categorical(
    relevant_gsea['GO_term_wrapped'],
    categories=go_counts.index,
    ordered=False
)

# Plot
plt.figure(figsize=(25, 35
                    ))
scatter = sns.scatterplot(
    data=relevant_gsea,
    x='category',
    y='GO_term',
    size='Odds Ratio',
    hue='log10_P-value',
    palette='RdPu',
    sizes=(350, 2000),
    edgecolor='black',
    linewidth=1,
    legend=False  # We'll add custom legends
)

# Custom size legend (Odds Ratio)
size_values = np.linspace(relevant_gsea['Odds Ratio'].min(), relevant_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
    markers.append(plt.scatter([], [], 
                               s=(50 + (size - size_values.min()) / 
                                  (size_values.max() - size_values.min()) * (2000-350)),
                               color='gray', edgecolor='black'))
    labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')

# Colorbar for -log10(P-value), smaller and moved
norm = mcolors.Normalize(vmin=relevant_gsea['log10_P-value'].min(),
                         vmax=relevant_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='RdPu', norm=norm)
sm.set_array([])

# Use fraction to control size and pad to move it
cbar = plt.colorbar(sm, ax=scatter, fraction=0.04, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=40)

# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10) 
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()

scatter.set_ylim(len(relevant_gsea['GO_term_wrapped'].unique()) - 0.5, -0.5)

plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Dotplot_all.svg", format='svg', dpi=2000)

plt.show()
No description has been provided for this image
InĀ [961]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_gsea['gene_count'] = combined_gsea['Genes'].str.count(';') + 1

# Keep only rows with at least 3 genes
filtered_gsea = combined_gsea[combined_gsea['gene_count'] >= 3].copy()

# # Optionally drop the temporary 'gene_count' column if no longer needed
# filtered_gsea.drop(columns=['gene_count'], inplace=True)

filtered_gsea = filtered_gsea[filtered_gsea['P-value'] <= 0.05].copy()
InĀ [962]:
import re

# Remove GO IDs in parentheses and trailing WP IDs
def clean_term(term):
    term = re.sub(r"\s*\(GO:\d+\)$", "", term)        # Remove GO IDs in parentheses
    term = re.sub(r"\s+WP\d+$", "", term)             # Remove trailing WP IDs
    return term.strip()

filtered_gsea["GO_term"] = filtered_gsea["GO_term"].apply(clean_term)
InĀ [963]:
filtered_gsea
Out[963]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes drug gene_count
0 KEGG_2021_Human TGF-beta signaling pathway 19/94 0.000308 0.095476 0 0 2.744796 22.192908 TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... Paclitaxel 19
1 KEGG_2021_Human RNA degradation 14/79 0.006392 0.711796 0 0 2.327992 11.762601 LSM1;LSM5;TOB1;PATL1;HSPD1;LSM2;EDC4;CNOT4;LSM... Paclitaxel 14
2 KEGG_2021_Human PI3K-Akt signaling pathway 44/354 0.007054 0.711796 0 0 1.540904 7.633904 PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... Paclitaxel 44
3 KEGG_2021_Human Hematopoietic cell lineage 16/99 0.009184 0.711796 0 0 2.083987 9.774403 CSF1;GP1BB;ITGA2;FLT3LG;CD3G;DNTT;EPOR;KITLG;C... Paclitaxel 16
4 KEGG_2021_Human ECM-receptor interaction 14/88 0.016214 0.907660 0 0 2.043849 8.424479 TNXB;LAMC3;GP1BB;ITGA2;DMP1;LAMC2;THBS2;IBSP;C... Paclitaxel 14
... ... ... ... ... ... ... ... ... ... ... ... ...
132680 GO_Molecular_Function_2025 Secondary Active Transmembrane Transporter Act... 11/58 0.046917 0.809614 0 0 1.915785 5.861105 SLC7A5;SLC35A1;CLCN6;SLC22A6;SLC46A2;SLC35D1;S... TAS102 11
132681 GO_Molecular_Function_2025 Activin Receptor Activity 3/8 0.047677 0.809614 0 0 4.904911 14.927115 ACVRL1;ACVR1C;TGFBR2 TAS102 3
132682 GO_Molecular_Function_2025 Double-Stranded Telomeric DNA Binding 3/8 0.047677 0.809614 0 0 4.904911 14.927115 XRCC6;TERF1;TERF2 TAS102 3
132683 GO_Molecular_Function_2025 Tumor Necrosis Factor Receptor Activity 3/8 0.047677 0.809614 0 0 4.904911 14.927115 TNFRSF14;TNFRSF25;TNFRSF4 TAS102 3
132684 GO_Molecular_Function_2025 Postsynaptic Neurotransmitter Receptor Activity 5/19 0.048587 0.809788 0 0 2.920795 8.833634 CHRNB2;CHRNA2;CHRNB3;CHRNA4;CHRNA6 TAS102 5

6122 rows Ɨ 12 columns

InĀ [964]:
drug_to_category = {}
for cat, drugs in drug_category.items():
    for d in drugs:
        drug_to_category[d] = cat
InĀ [965]:
filtered_gsea['category'] = filtered_gsea['drug'].map(drug_to_category)
InĀ [966]:
go_by_cat = (
    filtered_gsea.groupby('GO_term')
    .agg({
        'drug': lambda x: set(x),
        'category': lambda x: set(x),
        'Gene_set': lambda x: set(x)
    })
    .reset_index()
)

# Count number of drugs and categories per GO term
go_by_cat['num_drugs'] = go_by_cat['drug'].apply(len)
go_by_cat['num_categories'] = go_by_cat['category'].apply(len)

# --- Keep only overlaps WITHIN a single category ---
within_cat_go = go_by_cat[
    (go_by_cat['num_drugs'] > 1) & (go_by_cat['num_categories'] >= 1)
].copy()

# Sort so biggest overlaps appear first
within_cat_go = within_cat_go.sort_values(['num_drugs'], ascending=False)
InĀ [967]:
within_cat_go
Out[967]:
GO_term drug category Gene_set num_drugs num_categories
3036 Sequence-Specific DNA Binding {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... {Antimetabolite, Microtubule inhibitor, DNA cr... {GO_Molecular_Function_2025} 13 4
3037 Sequence-Specific Double-Stranded DNA Binding {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... {Antimetabolite, Microtubule inhibitor, DNA cr... {GO_Molecular_Function_2025} 13 4
670 Double-Stranded DNA Binding {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... {Antimetabolite, Microtubule inhibitor, DNA cr... {GO_Molecular_Function_2025} 13 4
3523 bHLH Transcription Factor Binding {FdU, Carboplatin, TAS102, Carmustine, Irinote... {Antimetabolite, Microtubule inhibitor, DNA cr... {GO_Molecular_Function_2025} 12 4
589 DNA-binding Transcription Activator Activity, ... {FdU, Cisplatin, Carboplatin, TAS102, Carmusti... {Antimetabolite, Microtubule inhibitor, DNA cr... {GO_Molecular_Function_2025} 11 4
... ... ... ... ... ... ...
54 Acetylcholine binding and downstream events {Etoposide, TAS102} {Antimetabolite, DNA strand break agent} {BioPlanet_2019} 2 2
55 Acetylcholine-Gated Monoatomic Cation-Selectiv... {Etoposide, TAS102} {Antimetabolite, DNA strand break agent} {GO_Molecular_Function_2025} 2 2
31 AKT Phosphorylates Targets in the Cytosol {Bleomycin, TAS102} {Antimetabolite, DNA strand break agent} {Reactome_Pathways_2024} 2 2
59 Acidic Amino Acid Transmembrane Transporter Ac... {5FU, Vinblastine} {Antimetabolite, Microtubule inhibitor} {GO_Molecular_Function_2025} 2 2
65 Activated Point Mutants of FGFR2 {Etoposide, Vinblastine} {Microtubule inhibitor, DNA strand break agent} {Reactome_Pathways_2024} 2 2

1327 rows Ɨ 6 columns

InĀ [968]:
# Define words/phrases you want to exclude
exclude_keywords = [
    "development", "Heart", "Neuron", "Morphogenesis", "Muscle", "T cell", "T-helper", "synaptic", 
    "neurogenesis", "Beta", "SARS", "Olfactory", "Gatrulation", "carcinoma", "tuberculosis", "breast", "gastric", "stem", "leukemia", 
    "endometrial", "pancreatic", "circadian rhythm", "melanoma", "colorectal", "osteoclast", "bladder", "melanogenesis",
    "cushing", "bladder", "virus", "neomycin", "RNA", "butanoate", "leishmaniasis", "glioma", "immunodeficiency", "vibrio", "cysteine", "Histidine",
    "fatty", "Xenobiotic", "biosynthesis", "sugar", "cytokine", "Gastrulation", "Nephrogenesis"
]

# Build regex pattern for case-insensitive search
pattern = "|".join(exclude_keywords)

# Keep only GO terms that do NOT contain those words
within_cat_go_filtered = within_cat_go[
    ~within_cat_go['GO_term'].str.contains(pattern, case=False, na=False)
].copy()
InĀ [994]:
# Define keywords of interest
keywords = ["Hippo signaling pathway", 
            "Activation of BH3-only Proteins", 
            "TGF-beta signaling pathway", 
            "Cellular senescence", 
            "Wnt signaling pathway", 
            "SNARE interactions in vesicular transport", 
            # "Central carbon metabolism in cancer", "Drug metabolism", "Proteoglycans in cancer", "inflmmatory mediator regulation of TRP channels",
            # "PI3K-Akt signaling pathway", 
            "Ferroptosis", 
            "NF-kappa B signaling pathway", 
            "Loss of Function of SMAD2 3 in cancer", 
            "Intrinsic Pathway for Apoptosis", "RAF Activation", 
            "TP53 Regulates Transcription of Genes Involved in G1 Cell Cycle Arrest", 
            "Pyroptosis",
            "Positive Regulation of Cell Population Proliferation", 
            "Phagolysosome Assembly", 
            "Autophagosome Maturation", 
            "Positive Regulation of Autophagy", 
            "Phagolysosome Assembly",
            "Stathmin and breast cancer resistance to antimicrotubule agents",
            "PI3K AKT Signaling in Cancer",
            "Myc active pathway",
            "Regulation of KIT Signaling",
            "Negative regulation of Telomere Maintenance",
            # "Chk1 Chk2(Cds1) Mediated Inactivation of Cyclin B Cdk1 Complex",
            "Regulated Necrosis",
            "Tumor Necrosis Factor Receptor Activity",
            "Pyrimidine metabolism",
            ]

# peripheral_gsea = filtered_gsea[
#     filtered_gsea["GO_term"].str.contains("|".join(keywords), case=False, na=False)
# ]

peripheral_gsea = filtered_gsea[
    filtered_gsea["GO_term"].isin(keywords)
]
InĀ [981]:
peripheral_gsea["drug"].unique()
Out[981]:
['Paclitaxel', 'Cisplatin', 'TFT', 'FdU', 'EdU', ..., 'Carmustine', 'Irinotecan', '6mercaptopurine', 'Vinblastine', 'TAS102']
Length: 16
Categories (16, object): ['TFT' < 'TAS102' < 'FdU' < 'EdU' ... 'Irinotecan' < 'Bleomycin' < 'Paclitaxel' < 'Vinblastine']
InĀ [976]:
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Ensure 'Combined Score' is float
peripheral_gsea['Combined Score'] = pd.to_numeric(peripheral_gsea['Combined Score'], errors='coerce')
peripheral_gsea = peripheral_gsea.dropna(subset=['Combined Score'])
peripheral_gsea['Odds Ratio'] = peripheral_gsea['Odds Ratio'].round().astype(int)

# Add log10 P-value
peripheral_gsea['log10_P-value'] = -np.log10(peripheral_gsea['P-value'])

peripheral_gsea['drug'] = pd.Categorical(
    peripheral_gsea['drug'],
    categories=ordered_drug_names,
    ordered=True
)

# Plot
plt.figure(figsize=(30, 15))
scatter = sns.scatterplot(
    data=peripheral_gsea,
    x='drug',
    y='GO_term',
    size='Odds Ratio',
    hue='log10_P-value',
    palette='cool',
    sizes=(150, 1000),
    edgecolor='black',
    linewidth=1,
    legend=False  # We'll add custom legends
)

# Custom size legend (Odds Ratio)
size_values = np.linspace(peripheral_gsea['Odds Ratio'].min(), peripheral_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
    markers.append(plt.scatter([], [], 
                               s=(50 + (size - size_values.min()) / 
                                  (size_values.max() - size_values.min()) * (1000-150)),
                               color='gray', edgecolor='black'))
    labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')

# Colorbar for -log10(P-value), smaller and moved
norm = mcolors.Normalize(vmin=peripheral_gsea['log10_P-value'].min(),
                         vmax=peripheral_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='cool', norm=norm)
sm.set_array([])

# Use fraction to control size and pad to move it
cbar = plt.colorbar(sm, ax=scatter, fraction=0.03, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=40)

# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10) 
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()
plt.show()
/tmp/ipykernel_1548459/3074881863.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

No description has been provided for this image
InĀ [1017]:
from textwrap import wrap

# Ensure 'Combined Score' is float
peripheral_gsea['Combined Score'] = pd.to_numeric(peripheral_gsea['Combined Score'], errors='coerce')
peripheral_gsea = peripheral_gsea.dropna(subset=['Combined Score'])
peripheral_gsea['Odds Ratio'] = peripheral_gsea['Odds Ratio'].round().astype(int)

# Add log10 P-value
peripheral_gsea['log10_P-value'] = -np.log10(peripheral_gsea['P-value'])

# Order drug categories if provided
peripheral_gsea['drug'] = pd.Categorical(
    peripheral_gsea['drug'],
    categories=ordered_drug_names,
    ordered=True
)

# -----------------------------
# Wrap y-axis labels if too long
# -----------------------------
median_len = int(peripheral_gsea['GO_term'].str.len().median())
median_len = 35

def wrap_label(label, max_len=median_len):
    """Split GO terms into 2 lines if longer than median length."""
    if len(label) > max_len:
        return "\n".join(wrap(label, max_len))
    return label

peripheral_gsea['GO_term_wrapped'] = peripheral_gsea['GO_term'].apply(wrap_label)

# -----------------------------
# Count overlaps per GO term
# -----------------------------

# Add counts to labels (optional, can remove if not wanted)
go_counts = peripheral_gsea.groupby('GO_term_wrapped')['drug'].nunique().sort_values(ascending=False)

# Reorder categorical axis (descending by overlap)
peripheral_gsea['GO_term_wrapped'] = pd.Categorical(
    peripheral_gsea['GO_term_wrapped'],
    categories=go_counts.index,
    ordered=True
)

# -----------------------------
# Plot
# -----------------------------
plt.figure(figsize=(40, 35))
scatter = sns.scatterplot(
    data=peripheral_gsea,
    x='drug',
    y='GO_term_wrapped',
    size='Odds Ratio',
    hue='log10_P-value',
    palette='RdPu',
    sizes=(350, 2000),
    edgecolor='black',
    linewidth=1,
    legend=False  # We'll add custom legends
)

# Custom size legend (Odds Ratio)
size_values = np.linspace(peripheral_gsea['Odds Ratio'].min(), peripheral_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
    markers.append(plt.scatter([], [], 
                               s=(50 + (size - size_values.min()) / 
                                  (size_values.max() - size_values.min()) * (2000-350)),
                               color='gray', edgecolor='black'))
    labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')

# Colorbar for -log10(P-value)
norm = mcolors.Normalize(vmin=peripheral_gsea['log10_P-value'].min(),
                         vmax=peripheral_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='RdPu', norm=norm)
sm.set_array([])

cbar = plt.colorbar(sm, ax=scatter, fraction=0.04, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=30)
cbar.locator = plt.MaxNLocator(nbins=9)  # automatically choose ticks
cbar.update_ticks()

# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10) 
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()

scatter.set_ylim(len(peripheral_gsea['GO_term_wrapped'].unique()) - 0.5, -0.5)

plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Dotplot_peripheral.svg", format='svg', dpi=2000)

plt.show()
No description has been provided for this image
InĀ [524]:
filtered_gsea
Out[524]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes drug gene_count category
0 KEGG_2021_Human TGF-beta signaling pathway 19/94 0.000308 0.095476 0 0 2.744796 22.192908 TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... Paclitaxel 19 Microtubule inhibitor
1 KEGG_2021_Human RNA degradation 14/79 0.006392 0.711796 0 0 2.327992 11.762601 LSM1;LSM5;TOB1;PATL1;HSPD1;LSM2;EDC4;CNOT4;LSM... Paclitaxel 14 Microtubule inhibitor
2 KEGG_2021_Human PI3K-Akt signaling pathway 44/354 0.007054 0.711796 0 0 1.540904 7.633904 PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... Paclitaxel 44 Microtubule inhibitor
3 KEGG_2021_Human Hematopoietic cell lineage 16/99 0.009184 0.711796 0 0 2.083987 9.774403 CSF1;GP1BB;ITGA2;FLT3LG;CD3G;DNTT;EPOR;KITLG;C... Paclitaxel 16 Microtubule inhibitor
4 KEGG_2021_Human ECM-receptor interaction 14/88 0.016214 0.907660 0 0 2.043849 8.424479 TNXB;LAMC3;GP1BB;ITGA2;DMP1;LAMC2;THBS2;IBSP;C... Paclitaxel 14 Microtubule inhibitor
... ... ... ... ... ... ... ... ... ... ... ... ... ...
4613 KEGG_2021_Human Tuberculosis 28/180 0.034162 0.295125 0 0 1.510800 5.101411 CEBPB;ITGAM;SRC;TCIRG1;HSPD1;MRC2;HLA-DMA;AKT2... TAS102 28 Antimetabolite
4614 KEGG_2021_Human Fatty acid degradation 9/43 0.039672 0.333462 0 0 2.166373 6.991107 HADHB;ADH4;ACAA2;ECHS1;ACSL6;ACADM;HADH;ACAT2;... TAS102 9 Antimetabolite
4615 KEGG_2021_Human Adherens junction 13/71 0.042007 0.342269 0 0 1.835265 5.817630 FARP2;TCF7L2;TCF7L1;SMAD3;YES1;SRC;NLK;RHOA;TG... TAS102 13 Antimetabolite
4616 KEGG_2021_Human Thyroid cancer 8/37 0.042921 0.342269 0 0 2.257272 7.106776 TCF7L2;CDKN1A;NRAS;TCF7L1;GADD45B;BAX;POLK;MAPK3 TAS102 8 Antimetabolite
4617 KEGG_2021_Human Circadian rhythm 7/31 0.045901 0.356877 0 0 2.386169 7.352447 PER1;FBXW11;RORB;BTRC;PRKAB1;NPAS2;ARNTL TAS102 7 Antimetabolite

253 rows Ɨ 13 columns

InĀ [865]:
peripheral_gsea 
Out[865]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes drug gene_count category log10_P-value GO_term_wrapped
0 KEGG_2021_Human TGF-beta signaling pathway 19/94 0.000308 0.095476 0 0 3 22.192908 TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... Paclitaxel 19 Microtubule inhibitor 3.511466 TGF-beta signaling pathway
2 KEGG_2021_Human PI3K-Akt signaling pathway 44/354 0.007054 0.711796 0 0 2 7.633904 PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... Paclitaxel 44 Microtubule inhibitor 2.151570 PI3K-Akt signaling pathway
323 Reactome_Pathways_2024 RAF Activation 8/34 0.006587 0.797496 0 0 3 16.680278 PPP2CA;CAMK2D;PPP2R1A;CAMK2A;ARAF;PPP2R5A;RAF1... Paclitaxel 8 Microtubule inhibitor 2.181313 RAF Activation
384 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 9/55 0.041190 0.900740 0 0 2 6.732095 PPP1R13B;APAF1;DIABLO;STAT3;E2F1;TP53BP2;NMT1;... Paclitaxel 9 Microtubule inhibitor 1.385211 Intrinsic Pathway for Apoptosis
390 Reactome_Pathways_2024 PI3K AKT Signaling in Cancer 15/110 0.045803 0.918209 0 0 2 5.256649 EGF;CD80;MAPKAP1;FLT3LG;FOXO6;PDGFA;PIK3CD;FGF... Paclitaxel 15 Microtubule inhibitor 1.339110 PI3K AKT Signaling in Cancer
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
124533 Reactome_Pathways_2024 Activation of BH3-only Proteins 10/30 0.000898 0.204766 0 0 4 28.742425 YWHAE;TFDP1;BAD;AKT2;AKT3;BCL2;PMAIP1;YWHAZ;BB... TAS102 10 Antimetabolite 3.046676 Activation of BH3-only Proteins
124539 Reactome_Pathways_2024 Intrinsic Pathway for Apoptosis 14/55 0.001878 0.244639 0 0 3 17.576975 YWHAE;BAD;STAT3;GZMB;YWHAZ;BBC3;TFDP1;AKT2;AKT... TAS102 14 Antimetabolite 2.726372 Intrinsic Pathway for Apoptosis
127094 GO_Biological_Process_2025 Positive Regulation of Cell Population Prolife... 76/484 0.000705 0.156055 0 0 2 11.175628 CNTFR;CSF2;KDM1A;BNC1;HTR2B;CIB1;TCIRG1;FGF1;A... TAS102 76 Antimetabolite 3.151827 Positive Regulation of Cell Population\nProlif...
131310 BioPlanet_2019 Myc active pathway 19/85 0.001739 0.196014 0 0 2 15.013030 EIF4A1;SMAD3;CDCA7;HMGA1;HSPD1;PRDX3;FOSL1;KAT... TAS102 19 Antimetabolite 2.759639 Myc active pathway
131440 BioPlanet_2019 Wnt signaling pathway 34/231 0.043339 0.409109 0 0 1 4.443843 NLK;PRKCZ;SOX2;GJA1;CCND3;SOX17;AKT2;HNF4A;AKT... TAS102 34 Antimetabolite 1.363120 Wnt signaling pathway

83 rows Ɨ 15 columns

InĀ [278]:
# Group GO terms by drug
go_to_drugs = (
    filtered_gsea.groupby('GO_term')['drug']
    .apply(set)   # collect drugs per GO term
    .reset_index()
)

# Keep only GO terms associated with >1 drug (common terms)
common_go_terms = go_to_drugs[go_to_drugs['drug'].apply(lambda x: len(x) > 1)].copy()

print(f"Found {len(common_go_terms)} GO terms common to multiple drugs")

# If you want to see how many drugs each term is shared across
common_go_terms['num_drugs'] = common_go_terms['drug'].apply(len)

# Sort by number of drugs sharing the GO term
common_go_terms = common_go_terms.sort_values('num_drugs', ascending=False)
Found 464 GO terms common to multiple drugs
InĀ [186]:
# List of irrelevant keywords to filter out (add/remove as needed)
irrelevant_keywords = [
    # "Glial", "development", "mediated", "Transport", 
    "neuronal", "Translation", "neural", "adipose", 'olfactory', "cancer"
    ]
    # add more if you want to exclude

# Ensure 'Combined Score' is float type
filtered_gsea['Combined Score'] = pd.to_numeric(filtered_gsea['Combined Score'], errors='coerce')

# Drop rows with NaNs in Combined Score just in case
filtered_gsea = filtered_gsea.dropna(subset=['Combined Score'])

# Filter out rows containing any irrelevant keywords in 'GO_term' (case insensitive)
pattern = '|'.join(irrelevant_keywords)
mask = ~filtered_gsea['GO_term'].str.lower().str.contains(pattern)
filtered_gsea = filtered_gsea[mask]

# Get top 2 combined score rows per drug after filtering
top2_by_drug = (
    filtered_gsea
    .sort_values(by=['drug', 'Combined Score'], ascending=[True, False])
    .groupby('drug')
    .head(1)
    .reset_index(drop=True)
)

top2_by_drug_sorted = top2_by_drug.copy()
top2_by_drug_sorted['log10_P-value'] = -np.log10(top2_by_drug_sorted['P-value'])

# Plot
plt.figure(figsize=(30, 30))

sns.scatterplot(
    data=top2_by_drug_sorted,
    x='drug',
    y='GO_term',
    size='Odds Ratio',
    hue='log10_P-value',
    palette='coolwarm_r',
    sizes=(50, 400),
    edgecolor='black',
    legend='brief'
)

# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.xlabel('Drug')
plt.ylabel('Top GO:BP Terms')
plt.tight_layout()
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()
No description has been provided for this image
InĀ [256]:
moa_keywords = {
    # Microtubule inhibitors
    'Paclitaxel': ['microtubule stabilization', 'mitotic arrest', 'spindle assembly', 'tubulin polymerization', 'chromosome segregation', 'microtubule dynamics', 'microtubule'],
    'Vinblastine': ['microtubule depolymerization', 'spindle disruption', 'tubulin binding', 'mitotic arrest', 'cell cycle', 'chromosome segregation', 'microtubule assembly'],

    # DNA cross-linking agents (alkylating/platinum-based)
    'Cisplatin': ['platinum-based', 'dna crosslink', 'alkylation', 'dna repair', 'adduct formation', 'replication fork stalling', 'apoptosis','autophagy'],
    'Carboplatin': ['platinum-based', 'dna crosslink', 'alkylation', 'dna repair', 'adduct formation', 'replication stress', 'apoptosis', 'autophagy'],
    'Carmustine': ['nitrosourea alkylation', 'dna crosslink', 'alkylation', 'dna damage', 'carbamoylation', 'dna repair inhibition', 'mutagenesis', 'autophagy'],
    'MitomycinC': ['dna crosslink', 'bioreductive activation', 'dna synthesis inhibition', 'hypoxia activation', 'crosslink repair', 'dna damage', 'autophagy'],

    # DNA strand break / topoisomerase inhibitors
    'Doxorubicin': ['topoisomerase inhibition', 'dna intercalation', 'free radical formation', 'oxidative stress', 'apoptosis', 'dna damage', 'autophagy'],
    'Etoposide': ['topoisomerase inhibition', 'dna double strand breaks', 'cell cycle arrest', 'checkpoint activation', 'apoptosis', 'dna damage', 'replication stress', 'autophagy'],
    'Irinotecan': ['topoisomerase inhibition', 'cell cycle arrest', 'dna damage', 'checkpoint activation', 'replication stress', 'apoptosis', 'autophagy'],
    'Bleomycin': ['dna strand cleavage', 'oxidative stress', 'free radical generation', 'dna damage', 'apoptosis', 'cell cycle arrest', 'autophagy'],

    # Antimetabolites
    'TFT': ['WNT', 'cell cycle', 'thymidine', 'nucleotide', 'dna synthesis inhibition', 'folate', 'autophagy'],
    'TAS102': ['nucleoside metabolic inhibition', 'thymidine analog', 'dna synthesis inhibition', 'thymidine kinase', 'replication stress', 'cell cycle arrest', 'lyso'],
    'FdU': ['dna chain termination', 'fluorodeoxyuridine', 'thymidylate synthase inhibition', 'dna synthesis inhibition', 'cell cycle arrest', 'nucleotide analog','autophagy'],
    'EdU': ['dna chain termination', 'thymidine analog', 'dna synthesis', 's-phase progression', 'cell proliferation', 'dna','autophagy'],
    '5FU': ['thymidylate synthase inhibition', 'fluorouracil', 'rna metabolism disruption', 'dna damage', 'cell cycle arrest', 'nucleotide metabolism','autophagy'],
    # '6mercaptopurine': ['DNA', 'purine metabolism inhibition', 'rna synthesis inhibition', 'dna synthesis inhibition', 'thiopurine metabolism', 'immunosuppression', 'cell proliferation inhibition'],
    '6mercaptopurine': ['DNA', 'synthesis', 'purine', 'dna synthesis', 'thiopurine', 'immunosuppression', 'cell proliferation','autophagy'],
}
InĀ [262]:
import pandas as pd

filtered_moa_results = []

for drug, keywords in moa_keywords.items():
    df_drug = combined_gsea[combined_gsea['drug'] == drug].copy()
    df_drug['GO_term_lower'] = df_drug['GO_term'].str.lower()
    pattern = '|'.join(keywords)
    mask = df_drug['GO_term_lower'].str.contains(pattern)
    filtered_drug = df_drug[mask]
    filtered_moa_results.append(filtered_drug)

filtered_moa_gsea = pd.concat(filtered_moa_results, ignore_index=True)

filtered_moa_gsea = filtered_moa_gsea[filtered_moa_gsea['gene_count'] >= 2].copy()
InĀ [224]:
# import pandas as pd

# filtered_moa_results = []

# for drug, keywords in moa_keywords.items():
#     df_drug = filtered_gsea[filtered_gsea['drug'] == drug].copy()
#     df_drug['GO_term_lower'] = df_drug['GO_term'].str.lower()
#     pattern = '|'.join(keywords)
#     mask = df_drug['GO_term_lower'].str.contains(pattern)
#     filtered_drug = df_drug[mask]
#     filtered_moa_results.append(filtered_drug)

# filtered_moa_gsea = pd.concat(filtered_moa_results, ignore_index=True)
# filtered_moa_gsea.drop(columns=['GO_term_lower'], inplace=True)
InĀ [263]:
filtered_moa_gsea
Out[263]:
Gene_set GO_term Overlap P-value p_adj Old P-value Old Adjusted P-value Odds Ratio Combined Score Genes drug gene_count GO_term_lower
0 GO_Biological_Process_2025 Vesicle Transport Along Microtubule (GO:0047496) 3/24 0.084750 0.806303 0 0 3.141049 7.752249 KIF3A;KIF1C;TRAK1 Paclitaxel 3 vesicle transport along microtubule (go:0047496)
2 GO_Biological_Process_2025 Protein Localization to Microtubule Organizing... 2/19 0.199940 0.806303 0 0 2.584314 4.160067 DCTN2;CSNK1D Paclitaxel 2 protein localization to microtubule organizing...
3 GO_Biological_Process_2025 Transport Along Microtubule (GO:0010970) 2/20 0.216135 0.806303 0 0 2.440613 3.738653 BAG3;COPG1 Paclitaxel 2 transport along microtubule (go:0010970)
6 GO_Biological_Process_2025 Regulation of Mitotic Spindle Assembly (GO:190... 2/22 0.248816 0.806303 0 0 2.196322 3.055173 PDCD6IP;VPS4B Paclitaxel 2 regulation of mitotic spindle assembly (go:190...
8 GO_Biological_Process_2025 Regulation of Spindle Assembly (GO:0090169) 2/31 0.394154 0.806303 0 0 1.513991 1.409545 PDCD6IP;VPS4B Paclitaxel 2 regulation of spindle assembly (go:0090169)
... ... ... ... ... ... ... ... ... ... ... ... ... ...
483 GO_Biological_Process_2025 Positive Regulation of Smooth Muscle Cell Prol... 2/49 0.343011 0.673254 0 0 1.679513 1.797067 TGFB1;HTR1B 6mercaptopurine 2 positive regulation of smooth muscle cell prol...
484 GO_Biological_Process_2025 Regulation of Autophagy (GO:0010506) 7/230 0.343498 0.673254 0 0 1.240296 1.325347 SETD2;ENDOG;BAD;LEP;PIP4K2A;SVIP;SIRT2 6mercaptopurine 7 regulation of autophagy (go:0010506)
487 GO_Biological_Process_2025 Regulation of Endothelial Cell Proliferation (... 3/87 0.365308 0.675820 0 0 1.409771 1.419660 SEMA5A;LEP;WNT5A 6mercaptopurine 3 regulation of endothelial cell proliferation (...
491 GO_Biological_Process_2025 Negative Regulation of Autophagy (GO:0010507) 2/65 0.480656 0.713921 0 0 1.251940 0.917175 LEP;SIRT2 6mercaptopurine 2 negative regulation of autophagy (go:0010507)
497 GO_Biological_Process_2025 Macroautophagy (GO:0016236) 3/162 0.768131 0.842545 0 0 0.741908 0.195711 GABARAPL2;STX17;PIP4K2A 6mercaptopurine 3 macroautophagy (go:0016236)

280 rows Ɨ 13 columns

InĀ [259]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# List of irrelevant keywords to filter out (add/remove as needed)
irrelevant_keywords = [
    "cancer", 'export', 'melanoma', 'kanamycin', 'stem', 'Thymus', 'viral', 'cholesterol'
    # add more if you want to exclude
]

# Ensure 'Combined Score' is float type
filtered_moa_gsea['Combined Score'] = pd.to_numeric(filtered_moa_gsea['Combined Score'], errors='coerce')

# Drop rows with NaNs in Combined Score just in case
filtered_gsea = filtered_moa_gsea.dropna(subset=['Combined Score'])

# Filter out rows containing any irrelevant keywords in 'GO_term' (case insensitive)
pattern = '|'.join(irrelevant_keywords)
mask = ~filtered_gsea['GO_term'].str.lower().str.contains(pattern)
filtered_gsea = filtered_gsea[mask]

# Get top 2 combined score rows per drug after filtering
top2_by_drug = (
    filtered_gsea
    .sort_values(by=['drug', 'Combined Score'], ascending=[True, False])
    .groupby('drug')
    .head(2)
    .reset_index(drop=True)
)

top2_by_drug_sorted = top2_by_drug.copy()
top2_by_drug_sorted['log10_P-value'] = -np.log10(top2_by_drug_sorted['P-value'])

# šŸ”¹ Ensure drug column follows custom order
top2_by_drug_sorted['drug'] = pd.Categorical(
    top2_by_drug_sorted['drug'],
    categories=ordered_drug_names,
    ordered=True
)

# Sort by that categorical order so plotting respects it
top2_by_drug_sorted = top2_by_drug_sorted.sort_values('drug')

# Create the plot
plt.figure(figsize=(40, 15))


# Plot with legend=False to prevent seaborn from adding any legend automatically
scatter = sns.scatterplot(
    data=top2_by_drug_sorted,
    x='drug',
    y='GO_term',
    size='Odds Ratio',
    hue='log10_P-value',
    palette='coolwarm_r',
    sizes=(50, 400),
    edgecolor='black',
    legend=False  # Disable seaborn's automatic legend
)

# Add size legend manually
# Create some legend handles for Odds Ratio sizes
import matplotlib.patches as mpatches
from matplotlib.legend_handler import HandlerTuple

# Get some example sizes for the legend
size_values = np.linspace(top2_by_drug_sorted['Odds Ratio'].min(), top2_by_drug_sorted['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
    markers.append(plt.scatter([], [], s=(50 + (size - size_values.min()) / (size_values.max() - size_values.min()) * (400-50)), 
                               color='gray', edgecolor='black'))
    labels.append(f"{size:.2f}")

# Create the legend for size
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(0.75, 1), loc='upper left')

# Add colorbar for -log10(P-value)
norm = mcolors.Normalize(vmin=top2_by_drug_sorted['log10_P-value'].min(),
                         vmax=top2_by_drug_sorted['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=scatter)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=50)

plt.xticks(rotation=45, ha='right')
plt.xlabel('Drug')
plt.ylabel('Top Two REACTOME Term')
# plt.grid(True, linestyle='--', alpha=0.3)
plt.grid(False)
plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [179]:
CCDC86_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20250805_Figure4_Survival.xlsx", sheet_name ='CCDC86')

GHSR_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20250805_Figure4_Survival.xlsx", sheet_name ='GHSR')
InĀ [180]:
data = CCDC86_df.iloc[3:].copy()

# Convert to numeric
data['Survival Rate'] = pd.to_numeric(data['Unnamed: 4'], errors='coerce')
data['Time (months)'] = pd.to_numeric(data['Unnamed: 5'], errors='coerce')
data['Type'] = data['Unnamed: 6']

# Convert months to years
data['Time (years)'] = data['Time (months)'] / 12

# Count samples per group
group_counts = data['Type'].value_counts().to_dict()

colors = {'High': '#007A03', 'Low': '#FFA90E'}
Cutoff = -1.6

plt.figure(figsize=(8, 8))

for group in ['High', 'Low']:
    group_data = data[data['Type'] == group].sort_values('Time (years)')
    plt.step(group_data['Time (years)'], group_data['Survival Rate'], where='post', 
             color=colors[group], linewidth=7, label=f"{group} (n={group_counts.get(group,0)})")

legend_handles = [Line2D([0], [0], color=colors[g], lw=3, label=f"{g} (n={group_counts.get(g,0)})") for g in ['High', 'Low']]
# legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=r"$P < 10^{-10}$"))

plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=14)
plt.title("CCDC86", fontsize=20)
plt.xlabel("Time (years)", fontsize=16)
plt.ylabel("Survival Probability", fontsize=16)
plt.ylim(0, 1.05)
plt.xlim(0, data['Time (years)'].max())
plt.grid(False)
plt.tight_layout()

plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_CCDC86.svg", format='svg', dpi=1000)

plt.show()
No description has been provided for this image
InĀ [181]:
data = GHSR_df.iloc[3:].copy()

# Convert to numeric
data['Survival Rate'] = pd.to_numeric(data['Unnamed: 4'], errors='coerce')
data['Time (months)'] = pd.to_numeric(data['Unnamed: 5'], errors='coerce')
data['Type'] = data['Unnamed: 6']

# Convert months to years
data['Time (years)'] = data['Time (months)'] / 12

# Count samples per group
group_counts = data['Type'].value_counts().to_dict()

colors = {'High': '#007A03', 'Low': '#FFA90E'}
Cutoff = -4.28

plt.figure(figsize=(8, 8))

for group in ['High', 'Low']:
    group_data = data[data['Type'] == group].sort_values('Time (years)')
    plt.step(group_data['Time (years)'], group_data['Survival Rate'], where='post', 
             color=colors[group], linewidth=7, label=f"{group} (n={group_counts.get(group,0)})")

legend_handles = [Line2D([0], [0], color=colors[g], lw=3, label=f"{g} (n={group_counts.get(g,0)})") for g in ['High', 'Low']]
# legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=r"$P < 10^{-10}$"))

plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=14)
plt.title("GHSR", fontsize=20)
plt.xlabel("Time (years)", fontsize=16)
plt.ylabel("Survival Probability", fontsize=16)
plt.ylim(0, 1.05)
plt.xlim(0, data['Time (years)'].max())
plt.grid(False)
plt.tight_layout()

plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_GHSR.svg", format='svg', dpi=1000)
    
plt.show()
No description has been provided for this image
InĀ [182]:
# def plot_gene_counts_barchart(df: pd.DataFrame, drug_name_list: list):
#     """
#     Generates a single grouped bar chart showing the number of up-regulated
#     and down-regulated genes for each drug.

#     Args:
#         df (pd.DataFrame): The DataFrame containing the drug data.
#         drug_name_list (list): A list of drug names to iterate through.
#     """
#     up_counts = []
#     down_counts = []

#     for drug_name in drug_name_list:
#         nan_col = f'nan_filter_{drug_name}'
#         fc_col = f'FC_{drug_name}'
#         pval_col = f'ovp3_{drug_name}'

#         # Skip if columns are missing for a drug
#         if not all(col in df.columns for col in [nan_col, fc_col, pval_col]):
#             print(f"Warning: Skipping {drug_name} as one or more required columns are missing.")
#             up_counts.append(0)
#             down_counts.append(0)
#             continue

#         # Filter for up-regulated genes (FC >= 0.5)
#         up_regulated = df[
#             (df[nan_col] == 1) &
#             (df[fc_col] >= 0.5) &
#             (df[pval_col] <= 0.05)
#         ]
#         up_counts.append(len(up_regulated))

#         # Filter for down-regulated genes (FC <= -0.5)
#         down_regulated = df[
#             (df[nan_col] == 1) &
#             (df[fc_col] <= -0.5) &
#             (df[pval_col] <= 0.05)
#         ]
#         down_counts.append(len(down_regulated))

#     # Set up the plot
#     bar_width = 0.35
#     index = np.arange(len(drug_name_list))

#     fig, ax = plt.subplots(figsize=(16, 16))
#     bar1 = ax.bar(index, up_counts, bar_width, label='Up-regulated (>0.5)', color='skyblue')
#     bar2 = ax.bar(index + bar_width, down_counts, bar_width, label='Down-regulated (<-0.5)', color='salmon')

#     # Add text labels on top of the bars
#     def add_labels(bars):
#         for bar in bars:
#             height = bar.get_height()
#             if height > 0:
#                 ax.text(bar.get_x() + bar.get_width() / 2., height,
#                         '%d' % int(height),
#                         ha='center', va='bottom')

#     add_labels(bar1)
#     add_labels(bar2)

#     # Customize the plot
#     ax.set_xlabel('Drug Name', fontsize=12)
#     ax.set_ylabel('Number of Genes', fontsize=12)
#     ax.set_title('Number of Up-regulated and Down-regulated Genes per Drug', fontsize=16)
#     ax.set_xticks(index + bar_width / 2)
#     ax.set_xticklabels(drug_name_list, rotation=90, ha='center')
#     ax.legend()
#     ax.grid(axis='y', linestyle='--', alpha=0.6)

#     plt.tight_layout()
#     plt.show()